Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
48380 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License Version 1.0 (CDDL-1.0).
7
* You can obtain a copy of the license from the top-level file
8
* "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
9
* You may not use this file except in compliance with the license.
10
*
11
* CDDL HEADER END
12
*/
13
14
/*
15
* Copyright (c) 2016, 2017, Intel Corporation.
16
*/
17
18
#ifdef HAVE_LIBUDEV
19
20
#include <errno.h>
21
#include <fcntl.h>
22
#include <libnvpair.h>
23
#include <libudev.h>
24
#include <libzfs.h>
25
#include <libzutil.h>
26
#include <pthread.h>
27
#include <stdlib.h>
28
#include <string.h>
29
30
#include <sys/sysevent/eventdefs.h>
31
#include <sys/sysevent/dev.h>
32
33
#include "zed_log.h"
34
#include "zed_disk_event.h"
35
#include "agents/zfs_agents.h"
36
37
/*
38
* Portions of ZED need to see disk events for disks belonging to ZFS pools.
39
* A libudev monitor is established to monitor block device actions and pass
40
* them on to internal ZED logic modules. Initially, zfs_mod.c is the only
41
* consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
42
* module responsible for handling disk events for ZFS.
43
*/
44
45
/* Id of the monitor thread created by zed_disk_event_init(). */
pthread_t g_mon_tid;
/* libudev context obtained from udev_new() in zed_disk_event_init(). */
struct udev *g_udev;
/* udev netlink monitor handed to the monitor thread as its argument. */
struct udev_monitor *g_mon;


/* Directory prefix of the persistent by-id device links. */
#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * 64MB is minimum usable disk for ZFS
 * (131072 sectors; at 512 bytes per sector that is 64 MiB).
 */
#define	MINIMUM_SECTORS		131072ULL
54
55
56
/*
57
* Post disk event to SLM module
58
*
59
* occurs in the context of monitor thread
60
*/
61
static void
62
zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
63
{
64
const char *strval;
65
uint64_t numval;
66
67
zed_log_msg(LOG_INFO, "zed_disk_event:");
68
zed_log_msg(LOG_INFO, "\tclass: %s", class);
69
zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
70
if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
71
zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
72
if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
73
zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
74
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
75
zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
76
if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
77
zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
78
if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
79
zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
80
if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
81
zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
82
if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)
83
zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);
84
if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
85
zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
86
if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
87
zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
88
89
(void) zfs_agent_post_event(class, subclass, nvl);
90
}
91
92
/*
93
* dev_event_nvlist: place event schema into an nv pair list
94
*
95
* NAME VALUE (example)
96
* -------------- --------------------------------------------------------
97
* DEV_NAME /dev/sdl
98
* DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
99
* DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
100
* DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
101
* DEV_IS_PART ---
102
* DEV_SIZE 500107862016
103
* ZFS_EV_POOL_GUID 17523635698032189180
104
* ZFS_EV_VDEV_GUID 14663607734290803088
105
*/
106
static nvlist_t *
107
dev_event_nvlist(struct udev_device *dev)
108
{
109
nvlist_t *nvl;
110
char strval[128];
111
const char *value, *path;
112
uint64_t guid;
113
114
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
115
return (NULL);
116
117
if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
118
(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
119
if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
120
(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
121
if ((path = udev_device_get_devnode(dev)) != NULL)
122
(void) nvlist_add_string(nvl, DEV_NAME, path);
123
if ((value = udev_device_get_devpath(dev)) != NULL)
124
(void) nvlist_add_string(nvl, DEV_PATH, value);
125
value = udev_device_get_devtype(dev);
126
if ((value != NULL && strcmp("partition", value) == 0) ||
127
(udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
128
!= NULL)) {
129
(void) nvlist_add_boolean(nvl, DEV_IS_PART);
130
}
131
if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
132
uint64_t numval = DEV_BSIZE;
133
134
numval *= strtoull(value, NULL, 10);
135
(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
136
137
/*
138
* If the device has a parent, then get the parent block
139
* device's size as well. For example, /dev/sda1's parent
140
* is /dev/sda.
141
*/
142
struct udev_device *parent_dev = udev_device_get_parent(dev);
143
if (parent_dev != NULL &&
144
(value = udev_device_get_sysattr_value(parent_dev, "size"))
145
!= NULL) {
146
uint64_t numval = DEV_BSIZE;
147
148
numval *= strtoull(value, NULL, 10);
149
(void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);
150
}
151
}
152
153
/*
154
* Grab the pool and vdev guids from blkid cache
155
*/
156
value = udev_device_get_property_value(dev, "ID_FS_UUID");
157
if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
158
(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
159
160
value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
161
if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
162
(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
163
164
/*
165
* Either a vdev guid or a devid must be present for matching
166
*/
167
if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
168
!nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
169
nvlist_free(nvl);
170
return (NULL);
171
}
172
173
return (nvl);
174
}
175
176
/*
177
* Listen for block device uevents
178
*/
179
static void *
180
zed_udev_monitor(void *arg)
181
{
182
struct udev_monitor *mon = arg;
183
const char *tmp;
184
char *tmp2;
185
186
zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
187
188
while (1) {
189
struct udev_device *dev;
190
const char *action, *type, *part, *sectors;
191
const char *bus, *uuid, *devpath;
192
const char *class, *subclass;
193
nvlist_t *nvl;
194
boolean_t is_zfs = B_FALSE;
195
196
/* allow a cancellation while blocked (recvmsg) */
197
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
198
199
/* blocks at recvmsg until an event occurs */
200
if ((dev = udev_monitor_receive_device(mon)) == NULL) {
201
zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
202
"device error %d", errno);
203
continue;
204
}
205
206
/* allow all steps to complete before a cancellation */
207
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
208
209
/*
210
* Strongly typed device is the preferred filter
211
*/
212
type = udev_device_get_property_value(dev, "ID_FS_TYPE");
213
if (type != NULL && type[0] != '\0') {
214
if (strcmp(type, "zfs_member") == 0) {
215
is_zfs = B_TRUE;
216
} else {
217
/* not ours, so skip */
218
zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
219
"%s (in use by %s)",
220
udev_device_get_devnode(dev), type);
221
udev_device_unref(dev);
222
continue;
223
}
224
}
225
226
/*
227
* if this is a disk and it is partitioned, then the
228
* zfs label will reside in a DEVTYPE=partition and
229
* we can skip passing this event
230
*
231
* Special case: Blank disks are sometimes reported with
232
* an erroneous 'atari' partition, and should not be
233
* excluded from being used as an autoreplace disk:
234
*
235
* https://github.com/openzfs/zfs/issues/13497
236
*/
237
type = udev_device_get_property_value(dev, "DEVTYPE");
238
part = udev_device_get_property_value(dev,
239
"ID_PART_TABLE_TYPE");
240
if (type != NULL && type[0] != '\0' &&
241
strcmp(type, "disk") == 0 &&
242
part != NULL && part[0] != '\0') {
243
const char *devname =
244
udev_device_get_property_value(dev, "DEVNAME");
245
246
if (strcmp(part, "atari") == 0) {
247
zed_log_msg(LOG_INFO,
248
"%s: %s is reporting an atari partition, "
249
"but we're going to assume it's a false "
250
"positive and still use it (issue #13497)",
251
__func__, devname);
252
} else {
253
zed_log_msg(LOG_INFO,
254
"%s: skip %s since it has a %s partition "
255
"already", __func__, devname, part);
256
/* skip and wait for partition event */
257
udev_device_unref(dev);
258
continue;
259
}
260
}
261
262
/*
263
* ignore small partitions
264
*/
265
sectors = udev_device_get_property_value(dev,
266
"ID_PART_ENTRY_SIZE");
267
if (sectors == NULL)
268
sectors = udev_device_get_sysattr_value(dev, "size");
269
if (sectors != NULL &&
270
strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
271
zed_log_msg(LOG_INFO,
272
"%s: %s sectors %s < %llu (minimum)",
273
__func__,
274
udev_device_get_property_value(dev, "DEVNAME"),
275
sectors, MINIMUM_SECTORS);
276
udev_device_unref(dev);
277
continue;
278
}
279
280
/*
281
* If the blkid probe didn't find ZFS, then a persistent
282
* device id string is required in the message schema
283
* for matching with vdevs. Preflight here for expected
284
* udev information.
285
*
286
* Special case:
287
* NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
288
* but they are valid for autoreplace. Add a special case for
289
* them by searching for "/nvme/" in the udev DEVPATH:
290
*
291
* DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
292
*/
293
bus = udev_device_get_property_value(dev, "ID_BUS");
294
uuid = udev_device_get_property_value(dev, "DM_UUID");
295
devpath = udev_device_get_devpath(dev);
296
if (!is_zfs && (bus == NULL && uuid == NULL &&
297
strstr(devpath, "/nvme/") == NULL)) {
298
zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
299
"source", udev_device_get_devnode(dev));
300
udev_device_unref(dev);
301
continue;
302
}
303
304
action = udev_device_get_action(dev);
305
if (strcmp(action, "add") == 0) {
306
class = EC_DEV_ADD;
307
subclass = ESC_DISK;
308
} else if (strcmp(action, "remove") == 0) {
309
class = EC_DEV_REMOVE;
310
subclass = ESC_DISK;
311
} else if (strcmp(action, "change") == 0) {
312
class = EC_DEV_STATUS;
313
subclass = ESC_DEV_DLE;
314
} else {
315
zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
316
action);
317
udev_device_unref(dev);
318
continue;
319
}
320
321
/*
322
* Special case an EC_DEV_ADD for multipath devices
323
*
324
* When a multipath device is created, udev reports the
325
* following:
326
*
327
* 1. "add" event of the dm device for the multipath device
328
* (like /dev/dm-3).
329
* 2. "change" event to create the actual multipath device
330
* symlink (like /dev/mapper/mpatha). The event also
331
* passes back the relevant DM vars we care about, like
332
* DM_UUID.
333
* 3. Another "change" event identical to #2 (that we ignore).
334
*
335
* To get the behavior we want, we treat the "change" event
336
* in #2 as a "add" event; as if "/dev/mapper/mpatha" was
337
* a new disk being added.
338
*/
339
if (strcmp(class, EC_DEV_STATUS) == 0 &&
340
udev_device_get_property_value(dev, "DM_UUID") &&
341
udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
342
tmp = udev_device_get_devnode(dev);
343
tmp2 = zfs_get_underlying_path(tmp);
344
if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
345
/*
346
* We have a real underlying device, which
347
* means that this multipath "change" event is
348
* an "add" event.
349
*
350
* If the multipath device and the underlying
351
* dev are the same name (i.e. /dev/dm-5), then
352
* there is no real underlying disk for this
353
* multipath device, and so this "change" event
354
* really is a multipath removal.
355
*/
356
class = EC_DEV_ADD;
357
subclass = ESC_DISK;
358
} else {
359
tmp = udev_device_get_property_value(dev,
360
"DM_NR_VALID_PATHS");
361
/* treat as a multipath remove */
362
if (tmp != NULL && strcmp(tmp, "0") == 0) {
363
class = EC_DEV_REMOVE;
364
subclass = ESC_DISK;
365
}
366
}
367
free(tmp2);
368
}
369
370
/*
371
* Special case an EC_DEV_ADD for scsi_debug devices
372
*
373
* These devices require a udevadm trigger command after
374
* creation in order to register the vdev_id scsidebug alias
375
* rule (adds a persistent path (phys_path) used for fault
376
* management automated tests in the ZFS test suite.
377
*
378
* After udevadm trigger command, event registers as a "change"
379
* event but needs to instead be handled as another "add" event
380
* to allow for disk labeling and partitioning to occur.
381
*/
382
if (strcmp(class, EC_DEV_STATUS) == 0 &&
383
udev_device_get_property_value(dev, "ID_VDEV") &&
384
udev_device_get_property_value(dev, "ID_MODEL")) {
385
const char *id_model, *id_model_sd = "scsi_debug";
386
387
id_model = udev_device_get_property_value(dev,
388
"ID_MODEL");
389
if (strcmp(id_model, id_model_sd) == 0) {
390
class = EC_DEV_ADD;
391
subclass = ESC_DISK;
392
}
393
}
394
395
if ((nvl = dev_event_nvlist(dev)) != NULL) {
396
zed_udev_event(class, subclass, nvl);
397
nvlist_free(nvl);
398
}
399
400
udev_device_unref(dev);
401
}
402
403
return (NULL);
404
}
405
406
int
407
zed_disk_event_init(void)
408
{
409
int fd, fflags;
410
411
if ((g_udev = udev_new()) == NULL) {
412
zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
413
return (-1);
414
}
415
416
/* Set up a udev monitor for block devices */
417
g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
418
udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
419
udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
420
"partition");
421
udev_monitor_enable_receiving(g_mon);
422
423
/* Make sure monitoring socket is blocking */
424
fd = udev_monitor_get_fd(g_mon);
425
if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
426
(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
427
428
/* spawn a thread to monitor events */
429
if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
430
udev_monitor_unref(g_mon);
431
udev_unref(g_udev);
432
zed_log_msg(LOG_WARNING, "pthread_create failed");
433
return (-1);
434
}
435
436
pthread_setname_np(g_mon_tid, "udev monitor");
437
zed_log_msg(LOG_INFO, "zed_disk_event_init");
438
439
return (0);
440
}
441
442
void
443
zed_disk_event_fini(void)
444
{
445
/* cancel monitor thread at recvmsg() */
446
(void) pthread_cancel(g_mon_tid);
447
(void) pthread_join(g_mon_tid, NULL);
448
449
/* cleanup udev resources */
450
udev_monitor_unref(g_mon);
451
udev_unref(g_udev);
452
453
zed_log_msg(LOG_INFO, "zed_disk_event_fini");
454
}
455
456
#else
457
458
#include "zed_disk_event.h"
459
460
/*
 * Stub used when HAVE_LIBUDEV is not defined: there is no udev monitor to
 * set up, so initialization always reports success (0).
 */
int
zed_disk_event_init(void)
{
	return (0);
}
465
466
/*
 * Stub used when HAVE_LIBUDEV is not defined: nothing was set up, so there
 * is nothing to tear down.
 */
void
zed_disk_event_fini(void)
{
}
470
471
#endif /* HAVE_LIBUDEV */
472
473