Path: blob/main/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
48380 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License Version 1.0 (CDDL-1.0).6* You can obtain a copy of the license from the top-level file7* "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.8* You may not use this file except in compliance with the license.9*10* CDDL HEADER END11*/1213/*14* Copyright (c) 2016, 2017, Intel Corporation.15*/1617#ifdef HAVE_LIBUDEV1819#include <errno.h>20#include <fcntl.h>21#include <libnvpair.h>22#include <libudev.h>23#include <libzfs.h>24#include <libzutil.h>25#include <pthread.h>26#include <stdlib.h>27#include <string.h>2829#include <sys/sysevent/eventdefs.h>30#include <sys/sysevent/dev.h>3132#include "zed_log.h"33#include "zed_disk_event.h"34#include "agents/zfs_agents.h"3536/*37* Portions of ZED need to see disk events for disks belonging to ZFS pools.38* A libudev monitor is established to monitor block device actions and pass39* them on to internal ZED logic modules. Initially, zfs_mod.c is the only40* consumer and is the Linux equivalent for the illumos syseventd ZFS SLM41* module responsible for handling disk events for ZFS.42*/4344pthread_t g_mon_tid;45struct udev *g_udev;46struct udev_monitor *g_mon;474849#define DEV_BYID_PATH "/dev/disk/by-id/"5051/* 64MB is minimum usable disk for ZFS */52#define MINIMUM_SECTORS 131072ULL535455/*56* Post disk event to SLM module57*58* occurs in the context of monitor thread59*/60static void61zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)62{63const char *strval;64uint64_t numval;6566zed_log_msg(LOG_INFO, "zed_disk_event:");67zed_log_msg(LOG_INFO, "\tclass: %s", class);68zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);69if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)70zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);71if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)72zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);73if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)74zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);75if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)76zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);77if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)78zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);79if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)80zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);81if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)82zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);83if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)84zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);85if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)86zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);8788(void) zfs_agent_post_event(class, subclass, nvl);89}9091/*92* dev_event_nvlist: place event schema into an nv pair list93*94* NAME VALUE (example)95* -------------- --------------------------------------------------------96* DEV_NAME /dev/sdl97* DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...98* DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC99* DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0100* DEV_IS_PART ---101* DEV_SIZE 500107862016102* ZFS_EV_POOL_GUID 17523635698032189180103* ZFS_EV_VDEV_GUID 14663607734290803088104*/105static nvlist_t *106dev_event_nvlist(struct udev_device *dev)107{108nvlist_t *nvl;109char strval[128];110const char *value, *path;111uint64_t guid;112113if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)114return (NULL);115116if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)117(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);118if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)119(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);120if ((path = udev_device_get_devnode(dev)) != NULL)121(void) nvlist_add_string(nvl, DEV_NAME, path);122if ((value = udev_device_get_devpath(dev)) != NULL)123(void) nvlist_add_string(nvl, DEV_PATH, value);124value = udev_device_get_devtype(dev);125if ((value != NULL && strcmp("partition", value) == 0) ||126(udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")127!= NULL)) {128(void) nvlist_add_boolean(nvl, DEV_IS_PART);129}130if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {131uint64_t numval = DEV_BSIZE;132133numval *= strtoull(value, NULL, 10);134(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);135136/*137* If the device has a parent, then get the parent block138* device's size as well. For example, /dev/sda1's parent139* is /dev/sda.140*/141struct udev_device *parent_dev = udev_device_get_parent(dev);142if (parent_dev != NULL &&143(value = udev_device_get_sysattr_value(parent_dev, "size"))144!= NULL) {145uint64_t numval = DEV_BSIZE;146147numval *= strtoull(value, NULL, 10);148(void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);149}150}151152/*153* Grab the pool and vdev guids from blkid cache154*/155value = udev_device_get_property_value(dev, "ID_FS_UUID");156if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)157(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);158159value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");160if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)161(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);162163/*164* Either a vdev guid or a devid must be present for matching165*/166if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&167!nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {168nvlist_free(nvl);169return (NULL);170}171172return (nvl);173}174175/*176* Listen for block device uevents177*/178static void *179zed_udev_monitor(void *arg)180{181struct udev_monitor *mon = arg;182const char *tmp;183char *tmp2;184185zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");186187while (1) {188struct udev_device *dev;189const char *action, *type, *part, *sectors;190const char *bus, *uuid, *devpath;191const char *class, *subclass;192nvlist_t *nvl;193boolean_t is_zfs = B_FALSE;194195/* allow a cancellation while blocked (recvmsg) */196pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);197198/* blocks at recvmsg until an event occurs */199if ((dev = udev_monitor_receive_device(mon)) == NULL) {200zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "201"device error %d", errno);202continue;203}204205/* allow all steps to complete before a cancellation */206pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);207208/*209* Strongly typed device is the preferred filter210*/211type = udev_device_get_property_value(dev, "ID_FS_TYPE");212if (type != NULL && type[0] != '\0') {213if (strcmp(type, "zfs_member") == 0) {214is_zfs = B_TRUE;215} else {216/* not ours, so skip */217zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "218"%s (in use by %s)",219udev_device_get_devnode(dev), type);220udev_device_unref(dev);221continue;222}223}224225/*226* if this is a disk and it is partitioned, then the227* zfs label will reside in a DEVTYPE=partition and228* we can skip passing this event229*230* Special case: Blank disks are sometimes reported with231* an erroneous 'atari' partition, and should not be232* excluded from being used as an autoreplace disk:233*234* https://github.com/openzfs/zfs/issues/13497235*/236type = udev_device_get_property_value(dev, "DEVTYPE");237part = udev_device_get_property_value(dev,238"ID_PART_TABLE_TYPE");239if (type != NULL && type[0] != '\0' &&240strcmp(type, "disk") == 0 &&241part != NULL && part[0] != '\0') {242const char *devname =243udev_device_get_property_value(dev, "DEVNAME");244245if (strcmp(part, "atari") == 0) {246zed_log_msg(LOG_INFO,247"%s: %s is reporting an atari partition, "248"but we're going to assume it's a false "249"positive and still use it (issue #13497)",250__func__, devname);251} else {252zed_log_msg(LOG_INFO,253"%s: skip %s since it has a %s partition "254"already", __func__, devname, part);255/* skip and wait for partition event */256udev_device_unref(dev);257continue;258}259}260261/*262* ignore small partitions263*/264sectors = udev_device_get_property_value(dev,265"ID_PART_ENTRY_SIZE");266if (sectors == NULL)267sectors = udev_device_get_sysattr_value(dev, "size");268if (sectors != NULL &&269strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {270zed_log_msg(LOG_INFO,271"%s: %s sectors %s < %llu (minimum)",272__func__,273udev_device_get_property_value(dev, "DEVNAME"),274sectors, MINIMUM_SECTORS);275udev_device_unref(dev);276continue;277}278279/*280* If the blkid probe didn't find ZFS, then a persistent281* device id string is required in the message schema282* for matching with vdevs. Preflight here for expected283* udev information.284*285* Special case:286* NVMe devices don't have ID_BUS set (at least on RHEL 7-8),287* but they are valid for autoreplace. Add a special case for288* them by searching for "/nvme/" in the udev DEVPATH:289*290* DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1291*/292bus = udev_device_get_property_value(dev, "ID_BUS");293uuid = udev_device_get_property_value(dev, "DM_UUID");294devpath = udev_device_get_devpath(dev);295if (!is_zfs && (bus == NULL && uuid == NULL &&296strstr(devpath, "/nvme/") == NULL)) {297zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "298"source", udev_device_get_devnode(dev));299udev_device_unref(dev);300continue;301}302303action = udev_device_get_action(dev);304if (strcmp(action, "add") == 0) {305class = EC_DEV_ADD;306subclass = ESC_DISK;307} else if (strcmp(action, "remove") == 0) {308class = EC_DEV_REMOVE;309subclass = ESC_DISK;310} else if (strcmp(action, "change") == 0) {311class = EC_DEV_STATUS;312subclass = ESC_DEV_DLE;313} else {314zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",315action);316udev_device_unref(dev);317continue;318}319320/*321* Special case an EC_DEV_ADD for multipath devices322*323* When a multipath device is created, udev reports the324* following:325*326* 1. "add" event of the dm device for the multipath device327* (like /dev/dm-3).328* 2. "change" event to create the actual multipath device329* symlink (like /dev/mapper/mpatha). The event also330* passes back the relevant DM vars we care about, like331* DM_UUID.332* 3. Another "change" event identical to #2 (that we ignore).333*334* To get the behavior we want, we treat the "change" event335* in #2 as a "add" event; as if "/dev/mapper/mpatha" was336* a new disk being added.337*/338if (strcmp(class, EC_DEV_STATUS) == 0 &&339udev_device_get_property_value(dev, "DM_UUID") &&340udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {341tmp = udev_device_get_devnode(dev);342tmp2 = zfs_get_underlying_path(tmp);343if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {344/*345* We have a real underlying device, which346* means that this multipath "change" event is347* an "add" event.348*349* If the multipath device and the underlying350* dev are the same name (i.e. /dev/dm-5), then351* there is no real underlying disk for this352* multipath device, and so this "change" event353* really is a multipath removal.354*/355class = EC_DEV_ADD;356subclass = ESC_DISK;357} else {358tmp = udev_device_get_property_value(dev,359"DM_NR_VALID_PATHS");360/* treat as a multipath remove */361if (tmp != NULL && strcmp(tmp, "0") == 0) {362class = EC_DEV_REMOVE;363subclass = ESC_DISK;364}365}366free(tmp2);367}368369/*370* Special case an EC_DEV_ADD for scsi_debug devices371*372* These devices require a udevadm trigger command after373* creation in order to register the vdev_id scsidebug alias374* rule (adds a persistent path (phys_path) used for fault375* management automated tests in the ZFS test suite.376*377* After udevadm trigger command, event registers as a "change"378* event but needs to instead be handled as another "add" event379* to allow for disk labeling and partitioning to occur.380*/381if (strcmp(class, EC_DEV_STATUS) == 0 &&382udev_device_get_property_value(dev, "ID_VDEV") &&383udev_device_get_property_value(dev, "ID_MODEL")) {384const char *id_model, *id_model_sd = "scsi_debug";385386id_model = udev_device_get_property_value(dev,387"ID_MODEL");388if (strcmp(id_model, id_model_sd) == 0) {389class = EC_DEV_ADD;390subclass = ESC_DISK;391}392}393394if ((nvl = dev_event_nvlist(dev)) != NULL) {395zed_udev_event(class, subclass, nvl);396nvlist_free(nvl);397}398399udev_device_unref(dev);400}401402return (NULL);403}404405int406zed_disk_event_init(void)407{408int fd, fflags;409410if ((g_udev = udev_new()) == NULL) {411zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);412return (-1);413}414415/* Set up a udev monitor for block devices */416g_mon = udev_monitor_new_from_netlink(g_udev, "udev");417udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");418udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",419"partition");420udev_monitor_enable_receiving(g_mon);421422/* Make sure monitoring socket is blocking */423fd = udev_monitor_get_fd(g_mon);424if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)425(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);426427/* spawn a thread to monitor events */428if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {429udev_monitor_unref(g_mon);430udev_unref(g_udev);431zed_log_msg(LOG_WARNING, "pthread_create failed");432return (-1);433}434435pthread_setname_np(g_mon_tid, "udev monitor");436zed_log_msg(LOG_INFO, "zed_disk_event_init");437438return (0);439}440441void442zed_disk_event_fini(void)443{444/* cancel monitor thread at recvmsg() */445(void) pthread_cancel(g_mon_tid);446(void) pthread_join(g_mon_tid, NULL);447448/* cleanup udev resources */449udev_monitor_unref(g_mon);450udev_unref(g_udev);451452zed_log_msg(LOG_INFO, "zed_disk_event_fini");453}454455#else456457#include "zed_disk_event.h"458459int460zed_disk_event_init(void)461{462return (0);463}464465void466zed_disk_event_fini(void)467{468}469470#endif /* HAVE_LIBUDEV */471472473