Path: blob/main/sys/contrib/openzfs/lib/libzpool/kernel.c
106840 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.23* Copyright (c) 2012, 2018 by Delphix. All rights reserved.24* Copyright (c) 2016 Actifio, Inc. All rights reserved.25* Copyright (c) 2025, Klara, Inc.26*/2728#include <assert.h>29#include <fcntl.h>30#include <libgen.h>31#include <poll.h>32#include <stdio.h>33#include <stdlib.h>34#include <string.h>35#include <limits.h>36#include <libzutil.h>37#include <sys/crypto/icp.h>38#include <sys/processor.h>39#include <sys/rrwlock.h>40#include <sys/spa.h>41#include <sys/spa_impl.h>42#include <sys/sid.h>43#include <sys/stat.h>44#include <sys/systeminfo.h>45#include <sys/time.h>46#include <sys/tsd.h>4748#include <libspl.h>49#include <libzpool.h>50#include <sys/zfs_context.h>51#include <sys/zfs_onexit.h>52#include <sys/zfs_vfsops.h>53#include <sys/zstd/zstd.h>54#include <sys/zvol.h>55#include <zfs_fletcher.h>56#include <zlib.h>5758/*59* Emulation of kernel services in userland.60*/6162uint32_t hostid;6364/* If set, all blocks read will be copied to the specified directory. */65char *vn_dumpdir = NULL;6667uint32_t68zone_get_hostid(void *zonep)69{70/*71* We're emulating the system's hostid in userland.72*/73(void) zonep;74return (hostid);75}7677/*78* =========================================================================79* vnode operations80* =========================================================================81*/8283/*84* =========================================================================85* Figure out which debugging statements to print86* =========================================================================87*/8889static char *dprintf_string;90static int dprintf_print_all;9192int93dprintf_find_string(const char *string)94{95char *tmp_str = dprintf_string;96int len = strlen(string);9798/*99* Find out if this is a string we want to print.100* String format: file1.c,function_name1,file2.c,file3.c101*/102103while (tmp_str != NULL) {104if (strncmp(tmp_str, string, len) == 0 &&105(tmp_str[len] == ',' || tmp_str[len] == '\0'))106return (1);107tmp_str = strchr(tmp_str, ',');108if (tmp_str != NULL)109tmp_str++; /* Get rid of , */110}111return (0);112}113114void115dprintf_setup(int *argc, char **argv)116{117int i, j;118119/*120* Debugging can be specified two ways: by setting the121* environment variable ZFS_DEBUG, or by including a122* "debug=..." argument on the command line. The command123* line setting overrides the environment variable.124*/125126for (i = 1; i < *argc; i++) {127int len = strlen("debug=");128/* First look for a command line argument */129if (strncmp("debug=", argv[i], len) == 0) {130dprintf_string = argv[i] + len;131/* Remove from args */132for (j = i; j < *argc; j++)133argv[j] = argv[j+1];134argv[j] = NULL;135(*argc)--;136}137}138139if (dprintf_string == NULL) {140/* Look for ZFS_DEBUG environment variable */141dprintf_string = getenv("ZFS_DEBUG");142}143144/*145* Are we just turning on all debugging?146*/147if (dprintf_find_string("on"))148dprintf_print_all = 1;149150if (dprintf_string != NULL)151zfs_flags |= ZFS_DEBUG_DPRINTF;152}153154/*155* =========================================================================156* debug printfs157* =========================================================================158*/159void160__dprintf(boolean_t dprint, const char *file, const char *func,161int line, const char *fmt, ...)162{163/* Get rid of annoying "../common/" prefix to filename. */164const char *newfile = zfs_basename(file);165166va_list adx;167if (dprint) {168/* dprintf messages are printed immediately */169170if (!dprintf_print_all &&171!dprintf_find_string(newfile) &&172!dprintf_find_string(func))173return;174175/* Print out just the function name if requested */176flockfile(stdout);177if (dprintf_find_string("pid"))178(void) printf("%d ", getpid());179if (dprintf_find_string("tid"))180(void) printf("%ju ",181(uintmax_t)(uintptr_t)pthread_self());182if (dprintf_find_string("cpu"))183(void) printf("%u ", getcpuid());184if (dprintf_find_string("time"))185(void) printf("%llu ", gethrtime());186if (dprintf_find_string("long"))187(void) printf("%s, line %d: ", newfile, line);188(void) printf("dprintf: %s: ", func);189va_start(adx, fmt);190(void) vprintf(fmt, adx);191va_end(adx);192funlockfile(stdout);193} else {194/* zfs_dbgmsg is logged for dumping later */195size_t size;196char *buf;197int i;198199size = 1024;200buf = umem_alloc(size, UMEM_NOFAIL);201i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);202203if (i < size) {204va_start(adx, fmt);205(void) vsnprintf(buf + i, size - i, fmt, adx);206va_end(adx);207}208209__zfs_dbgmsg(buf);210211umem_free(buf, size);212}213}214215/*216* =========================================================================217* cmn_err() and panic()218* =========================================================================219*/220221static __attribute__((noreturn)) void222panic_stop_or_abort(void)223{224const char *stopenv = getenv("LIBZPOOL_PANIC_STOP");225if (stopenv != NULL && atoi(stopenv)) {226fputs("libzpool: LIBZPOOL_PANIC_STOP is set, sending "227"SIGSTOP to process group\n", stderr);228fflush(stderr);229230kill(0, SIGSTOP);231232fputs("libzpool: continued after panic stop, "233"aborting\n", stderr);234}235236abort(); /* think of it as a "user-level crash dump" */237}238239static void240vcmn_msg(int ce, const char *fmt, va_list adx)241{242switch (ce) {243case CE_IGNORE:244return;245case CE_CONT:246break;247case CE_NOTE:248fputs("libzpool: NOTICE: ", stderr);249break;250case CE_WARN:251fputs("libzpool: WARNING: ", stderr);252break;253case CE_PANIC:254fputs("libzpool: PANIC: ", stderr);255break;256default:257fputs("libzpool: [unknown severity %d]: ", stderr);258break;259}260261vfprintf(stderr, fmt, adx);262if (ce != CE_CONT)263fputc('\n', stderr);264fflush(stderr);265}266267void268vcmn_err(int ce, const char *fmt, va_list adx)269{270vcmn_msg(ce, fmt, adx);271272if (ce == CE_PANIC)273panic_stop_or_abort();274}275276void277cmn_err(int ce, const char *fmt, ...)278{279va_list adx;280281va_start(adx, fmt);282vcmn_err(ce, fmt, adx);283va_end(adx);284}285286__attribute__((noreturn)) void287panic(const char *fmt, ...)288{289va_list adx;290291va_start(adx, fmt);292vcmn_msg(CE_PANIC, fmt, adx);293va_end(adx);294295panic_stop_or_abort();296}297298__attribute__((noreturn)) void299vpanic(const char *fmt, va_list adx)300{301vcmn_msg(CE_PANIC, fmt, adx);302panic_stop_or_abort();303}304305/*306* =========================================================================307* misc routines308* =========================================================================309*/310311void312delay(clock_t ticks)313{314(void) poll(0, 0, ticks * (1000 / hz));315}316317/*318* Find highest one bit set.319* Returns bit number + 1 of highest bit that is set, otherwise returns 0.320* The __builtin_clzll() function is supported by both GCC and Clang.321*/322int323highbit64(uint64_t i)324{325if (i == 0)326return (0);327328return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));329}330331/*332* Find lowest one bit set.333* Returns bit number + 1 of lowest bit that is set, otherwise returns 0.334* The __builtin_ffsll() function is supported by both GCC and Clang.335*/336int337lowbit64(uint64_t i)338{339if (i == 0)340return (0);341342return (__builtin_ffsll(i));343}344345int346ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)347{348errno = 0;349*result = strtoull(str, nptr, base);350if (*result == 0)351return (errno);352return (0);353}354355/*356* =========================================================================357* kernel emulation setup & teardown358* =========================================================================359*/360static int361umem_out_of_memory(void)362{363char errmsg[] = "out of memory -- generating core dump\n";364365(void) fprintf(stderr, "%s", errmsg);366abort();367return (0);368}369370static void371spa_config_load(void)372{373void *buf = NULL;374nvlist_t *nvlist, *child;375nvpair_t *nvpair;376char *pathname;377zfs_file_t *fp;378zfs_file_attr_t zfa;379uint64_t fsize;380int err;381382/*383* Open the configuration file.384*/385pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);386387(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);388389err = zfs_file_open(pathname, O_RDONLY, 0, &fp);390if (err)391err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);392393kmem_free(pathname, MAXPATHLEN);394395if (err)396return;397398if (zfs_file_getattr(fp, &zfa))399goto out;400401fsize = zfa.zfa_size;402buf = kmem_alloc(fsize, KM_SLEEP);403404/*405* Read the nvlist from the file.406*/407if (zfs_file_read(fp, buf, fsize, NULL) < 0)408goto out;409410/*411* Unpack the nvlist.412*/413if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)414goto out;415416/*417* Iterate over all elements in the nvlist, creating a new spa_t for418* each one with the specified configuration.419*/420spa_namespace_enter(FTAG);421nvpair = NULL;422while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {423if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)424continue;425426child = fnvpair_value_nvlist(nvpair);427428if (spa_lookup(nvpair_name(nvpair)) != NULL)429continue;430(void) spa_add(nvpair_name(nvpair), child, NULL);431}432spa_namespace_exit(FTAG);433434nvlist_free(nvlist);435436out:437if (buf != NULL)438kmem_free(buf, fsize);439440zfs_file_close(fp);441}442443void444kernel_init(int mode)445{446extern uint_t rrw_tsd_key;447448libspl_init();449450umem_nofail_callback(umem_out_of_memory);451452dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,453(double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));454455hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0;456457system_taskq_init();458icp_init();459460zstd_init();461462spa_init((spa_mode_t)mode);463spa_config_load();464465fletcher_4_init();466467tsd_create(&rrw_tsd_key, rrw_tsd_destroy);468}469470void471kernel_fini(void)472{473fletcher_4_fini();474spa_fini();475476zstd_fini();477478icp_fini();479system_taskq_fini();480481libspl_fini();482}483484zfs_file_t *485zfs_onexit_fd_hold(int fd, minor_t *minorp)486{487(void) fd;488*minorp = 0;489return (NULL);490}491492void493zfs_onexit_fd_rele(zfs_file_t *fp)494{495(void) fp;496}497498int499zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,500uintptr_t *action_handle)501{502(void) minor, (void) func, (void) data, (void) action_handle;503return (0);504}505506void507zvol_create_minors(const char *name)508{509(void) name;510}511512void513zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)514{515(void) spa, (void) name, (void) async;516}517518void519zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,520boolean_t async)521{522(void) spa, (void) oldname, (void) newname, (void) async;523}524525/*526* Open file527*528* path - fully qualified path to file529* flags - file attributes O_READ / O_WRITE / O_EXCL530* fpp - pointer to return file pointer531*532* Returns 0 on success underlying error on failure.533*/534int535zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)536{537int fd;538int dump_fd;539int err;540int old_umask = 0;541zfs_file_t *fp;542struct stat64 st;543544if (!(flags & O_CREAT) && stat64(path, &st) == -1)545return (errno);546547if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))548flags |= O_DIRECT;549550if (flags & O_CREAT)551old_umask = umask(0);552553fd = open64(path, flags, mode);554if (fd == -1)555return (errno);556557if (flags & O_CREAT)558(void) umask(old_umask);559560if (vn_dumpdir != NULL) {561char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);562const char *inpath = zfs_basename(path);563564(void) snprintf(dumppath, MAXPATHLEN,565"%s/%s", vn_dumpdir, inpath);566dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);567umem_free(dumppath, MAXPATHLEN);568if (dump_fd == -1) {569err = errno;570close(fd);571return (err);572}573} else {574dump_fd = -1;575}576577(void) fcntl(fd, F_SETFD, FD_CLOEXEC);578579fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);580fp->f_fd = fd;581fp->f_dump_fd = dump_fd;582*fpp = fp;583584return (0);585}586587void588zfs_file_close(zfs_file_t *fp)589{590close(fp->f_fd);591if (fp->f_dump_fd != -1)592close(fp->f_dump_fd);593594umem_free(fp, sizeof (zfs_file_t));595}596597/*598* Stateful write - use os internal file pointer to determine where to599* write and update on successful completion.600*601* fp - pointer to file (pipe, socket, etc) to write to602* buf - buffer to write603* count - # of bytes to write604* resid - pointer to count of unwritten bytes (if short write)605*606* Returns 0 on success errno on failure.607*/608int609zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)610{611ssize_t rc;612613rc = write(fp->f_fd, buf, count);614if (rc < 0)615return (errno);616617if (resid) {618*resid = count - rc;619} else if (rc != count) {620return (EIO);621}622623return (0);624}625626/*627* Stateless write - os internal file pointer is not updated.628*629* fp - pointer to file (pipe, socket, etc) to write to630* buf - buffer to write631* count - # of bytes to write632* off - file offset to write to (only valid for seekable types)633* resid - pointer to count of unwritten bytes634*635* Returns 0 on success errno on failure.636*/637int638zfs_file_pwrite(zfs_file_t *fp, const void *buf,639size_t count, loff_t pos, uint8_t ashift, ssize_t *resid)640{641ssize_t rc, split, done;642int sectors;643644/*645* To simulate partial disk writes, we split writes into two646* system calls so that the process can be killed in between.647* This is used by ztest to simulate realistic failure modes.648*/649sectors = count >> ashift;650split = (sectors > 0 ? rand() % sectors : 0) << ashift;651rc = pwrite64(fp->f_fd, buf, split, pos);652if (rc != -1) {653done = rc;654rc = pwrite64(fp->f_fd, (char *)buf + split,655count - split, pos + split);656}657#ifdef __linux__658if (rc == -1 && errno == EINVAL) {659/*660* Under Linux, this most likely means an alignment issue661* (memory or disk) due to O_DIRECT, so we abort() in order662* to catch the offender.663*/664abort();665}666#endif667668if (rc < 0)669return (errno);670671done += rc;672673if (resid) {674*resid = count - done;675} else if (done != count) {676return (EIO);677}678679return (0);680}681682/*683* Stateful read - use os internal file pointer to determine where to684* read and update on successful completion.685*686* fp - pointer to file (pipe, socket, etc) to read from687* buf - buffer to write688* count - # of bytes to read689* resid - pointer to count of unread bytes (if short read)690*691* Returns 0 on success errno on failure.692*/693int694zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)695{696int rc;697698rc = read(fp->f_fd, buf, count);699if (rc < 0)700return (errno);701702if (resid) {703*resid = count - rc;704} else if (rc != count) {705return (EIO);706}707708return (0);709}710711/*712* Stateless read - os internal file pointer is not updated.713*714* fp - pointer to file (pipe, socket, etc) to read from715* buf - buffer to write716* count - # of bytes to write717* off - file offset to read from (only valid for seekable types)718* resid - pointer to count of unwritten bytes (if short write)719*720* Returns 0 on success errno on failure.721*/722int723zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,724ssize_t *resid)725{726ssize_t rc;727728rc = pread64(fp->f_fd, buf, count, off);729if (rc < 0) {730#ifdef __linux__731/*732* Under Linux, this most likely means an alignment issue733* (memory or disk) due to O_DIRECT, so we abort() in order to734* catch the offender.735*/736if (errno == EINVAL)737abort();738#endif739return (errno);740}741742if (fp->f_dump_fd != -1) {743int status;744745status = pwrite64(fp->f_dump_fd, buf, rc, off);746ASSERT(status != -1);747}748749if (resid) {750*resid = count - rc;751} else if (rc != count) {752return (EIO);753}754755return (0);756}757758/*759* lseek - set / get file pointer760*761* fp - pointer to file (pipe, socket, etc) to read from762* offp - value to seek to, returns current value plus passed offset763* whence - see man pages for standard lseek whence values764*765* Returns 0 on success errno on failure (ESPIPE for non seekable types)766*/767int768zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)769{770loff_t rc;771772rc = lseek(fp->f_fd, *offp, whence);773if (rc < 0)774return (errno);775776*offp = rc;777778return (0);779}780781/*782* Get file attributes783*784* filp - file pointer785* zfattr - pointer to file attr structure786*787* Currently only used for fetching size and file mode788*789* Returns 0 on success or error code of underlying getattr call on failure.790*/791int792zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)793{794struct stat64 st;795796if (fstat64_blk(fp->f_fd, &st) == -1)797return (errno);798799zfattr->zfa_size = st.st_size;800zfattr->zfa_mode = st.st_mode;801802return (0);803}804805/*806* Sync file to disk807*808* filp - file pointer809* flags - O_SYNC and or O_DSYNC810*811* Returns 0 on success or error code of underlying sync call on failure.812*/813int814zfs_file_fsync(zfs_file_t *fp, int flags)815{816(void) flags;817818if (fsync(fp->f_fd) < 0)819return (errno);820821return (0);822}823824/*825* deallocate - zero and/or deallocate file storage826*827* fp - file pointer828* offset - offset to start zeroing or deallocating829* len - length to zero or deallocate830*/831int832zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)833{834int rc;835#if defined(__linux__)836rc = fallocate(fp->f_fd,837FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);838#elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029)839struct spacectl_range rqsr = {840.r_offset = offset,841.r_len = len,842};843rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr);844#else845(void) fp, (void) offset, (void) len;846rc = EOPNOTSUPP;847#endif848if (rc)849return (SET_ERROR(rc));850return (0);851}852853/*854* Request current file pointer offset855*856* fp - pointer to file857*858* Returns current file offset.859*/860loff_t861zfs_file_off(zfs_file_t *fp)862{863return (lseek(fp->f_fd, SEEK_CUR, 0));864}865866/*867* unlink file868*869* path - fully qualified file path870*871* Returns 0 on success.872*873* OPTIONAL874*/875int876zfs_file_unlink(const char *path)877{878return (remove(path));879}880881/*882* Get reference to file pointer883*884* fd - input file descriptor885*886* Returns pointer to file struct or NULL.887* Unsupported in user space.888*/889zfs_file_t *890zfs_file_get(int fd)891{892(void) fd;893abort();894return (NULL);895}896/*897* Drop reference to file pointer898*899* fp - pointer to file struct900*901* Unsupported in user space.902*/903void904zfs_file_put(zfs_file_t *fp)905{906abort();907(void) fp;908}909910void911zfsvfs_update_fromname(const char *oldname, const char *newname)912{913(void) oldname, (void) newname;914}915916void917spa_import_os(spa_t *spa)918{919(void) spa;920}921922void923spa_export_os(spa_t *spa)924{925(void) spa;926}927928void929spa_activate_os(spa_t *spa)930{931(void) spa;932}933934void935spa_deactivate_os(spa_t *spa)936{937(void) spa;938}939940941