GitHub Repository: torvalds/linux
Path: blob/master/drivers/block/ublk_drv.c
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
* Userspace block device - a block device whose IO is handled from userspace
*
* Takes full advantage of the io_uring passthrough command for communicating
* with the ublk userspace daemon (ublksrvd) to handle basic IO requests.
*
* Copyright 2022 Ming Lei <[email protected]>
*
* (part of code stolen from loop.c)
*/
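
/*
* A minimal sketch (not part of the driver) of how a ublk server might arm
* one io tag from userspace, assuming liburing and <linux/ublk_cmd.h>; the
* helper name queue_fetch_cmd() and the setup details are illustrative
* assumptions only, not the canonical server implementation:
*
*	// ring created with IORING_SETUP_SQE128 (the uring_cmd payload
*	// needs big SQEs)
*	static void queue_fetch_cmd(struct io_uring *ring, int cdev_fd,
*				    __u16 q_id, __u16 tag, __u64 buf_addr)
*	{
*		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
*		struct ublksrv_io_cmd *cmd = (void *)&sqe->cmd;
*
*		sqe->opcode = IORING_OP_URING_CMD;
*		sqe->fd = cdev_fd;		// /dev/ublkcN char device
*		sqe->cmd_op = UBLK_U_IO_FETCH_REQ;
*		cmd->q_id = q_id;
*		cmd->tag = tag;
*		cmd->addr = buf_addr;		// per-tag IO buffer
*	}
*
* The driver completes this uring_cmd (see ublk_complete_io_cmd() below)
* when a blk-mq request arrives for that tag; the server handles it and
* re-arms with UBLK_U_IO_COMMIT_AND_FETCH_REQ.
*/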
12
#include <linux/module.h>
13
#include <linux/moduleparam.h>
14
#include <linux/sched.h>
15
#include <linux/fs.h>
16
#include <linux/pagemap.h>
17
#include <linux/file.h>
18
#include <linux/stat.h>
19
#include <linux/errno.h>
20
#include <linux/major.h>
21
#include <linux/wait.h>
22
#include <linux/blkdev.h>
23
#include <linux/init.h>
24
#include <linux/swap.h>
25
#include <linux/slab.h>
26
#include <linux/compat.h>
27
#include <linux/mutex.h>
28
#include <linux/writeback.h>
29
#include <linux/completion.h>
30
#include <linux/highmem.h>
31
#include <linux/sysfs.h>
32
#include <linux/miscdevice.h>
33
#include <linux/falloc.h>
34
#include <linux/uio.h>
35
#include <linux/ioprio.h>
36
#include <linux/sched/mm.h>
37
#include <linux/uaccess.h>
38
#include <linux/cdev.h>
39
#include <linux/io_uring/cmd.h>
40
#include <linux/blk-mq.h>
41
#include <linux/delay.h>
42
#include <linux/mm.h>
43
#include <asm/page.h>
44
#include <linux/task_work.h>
45
#include <linux/namei.h>
46
#include <linux/kref.h>
47
#include <uapi/linux/ublk_cmd.h>
48
49
#define UBLK_MINORS (1U << MINORBITS)
50
51
#define UBLK_INVALID_BUF_IDX ((u16)-1)
52
53
/* private ioctl command mirror */
54
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
55
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
56
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
57
58
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
59
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
60
61
/* All UBLK_F_* have to be included into UBLK_F_ALL */
62
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
63
| UBLK_F_URING_CMD_COMP_IN_TASK \
64
| UBLK_F_NEED_GET_DATA \
65
| UBLK_F_USER_RECOVERY \
66
| UBLK_F_USER_RECOVERY_REISSUE \
67
| UBLK_F_UNPRIVILEGED_DEV \
68
| UBLK_F_CMD_IOCTL_ENCODE \
69
| UBLK_F_USER_COPY \
70
| UBLK_F_ZONED \
71
| UBLK_F_USER_RECOVERY_FAIL_IO \
72
| UBLK_F_UPDATE_SIZE \
73
| UBLK_F_AUTO_BUF_REG \
74
| UBLK_F_QUIESCE \
75
| UBLK_F_PER_IO_DAEMON \
76
| UBLK_F_BUF_REG_OFF_DAEMON)
77
78
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
79
| UBLK_F_USER_RECOVERY_REISSUE \
80
| UBLK_F_USER_RECOVERY_FAIL_IO)
81
82
/* All UBLK_PARAM_TYPE_* should be included here */
83
#define UBLK_PARAM_TYPE_ALL \
84
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
85
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
86
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
87
88
struct ublk_uring_cmd_pdu {
89
/*
* Temporarily store requests from the same batch for queueing them to
* the daemon context.
*
* They could have been stored in the request payload, but we want to
* avoid extra pre-allocation, and the uring_cmd payload is always
* free for us.
*/
97
union {
98
struct request *req;
99
struct request *req_list;
100
};
101
102
/*
* The following two fields are valid for this cmd's whole lifetime, and
* are set up in the ublk uring_cmd handler.
*/
106
struct ublk_queue *ubq;
107
108
u16 tag;
109
};
110
111
/*
* io command is active: the sqe cmd has been received, and its cqe isn't done
*
* If the flag is set, the io command is owned by the ublk driver, and waits
* for an incoming blk-mq request from the ublk block device.
*
* If the flag is cleared, the io command will be completed, and owned by the
* ublk server.
*/
120
#define UBLK_IO_FLAG_ACTIVE 0x01
121
122
/*
* IO command is completed via cqe, and it is being handled by ublksrv, and
* not committed yet
*
* Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
* for cross verification
*/
129
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
130
131
/*
* UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
* fetching the data buffer address from ublksrv.
*
* Then bio data can be copied into this data buffer for a WRITE request
* after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
*/
138
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
139
140
/*
* The request buffer is registered automatically, so we have to unregister it
* before completing this request.
*
* io_uring will unregister the buffer automatically for us when exiting.
*/
146
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
147
148
/* atomic RW with ubq->cancel_lock */
149
#define UBLK_IO_FLAG_CANCELED 0x80000000
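
/*
* Typical per-io flag lifecycle, summarized from the flag comments above
* (descriptive note only):
*
*	FETCH_REQ received            -> UBLK_IO_FLAG_ACTIVE set
*	blk-mq request dispatched     -> ACTIVE cleared, OWNED_BY_SRV set
*	COMMIT_AND_FETCH_REQ arrives  -> OWNED_BY_SRV cleared, ACTIVE set again
*	server/daemon goes away       -> CANCELED set under ubq->cancel_lock
*/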
150
151
/*
152
* Initialize refcount to a large number to include any registered buffers.
153
* UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
154
* any buffers registered on the io daemon task.
155
*/
156
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
157
158
struct ublk_io {
159
/* userspace buffer address from io cmd */
160
union {
161
__u64 addr;
162
struct ublk_auto_buf_reg buf;
163
};
164
unsigned int flags;
165
int res;
166
167
union {
168
/* valid if UBLK_IO_FLAG_ACTIVE is set */
169
struct io_uring_cmd *cmd;
170
/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
171
struct request *req;
172
};
173
174
struct task_struct *task;
175
176
/*
177
* The number of uses of this I/O by the ublk server
178
* if user copy or zero copy are enabled:
179
* - UBLK_REFCOUNT_INIT from dispatch to the server
180
* until UBLK_IO_COMMIT_AND_FETCH_REQ
181
* - 1 for each inflight ublk_ch_{read,write}_iter() call
182
* - 1 for each io_uring registered buffer not registered on task
183
* The I/O can only be completed once all references are dropped.
184
* User copy and buffer registration operations are only permitted
185
* if the reference count is nonzero.
186
*/
187
refcount_t ref;
188
/* Count of buffers registered on task and not yet unregistered */
189
unsigned task_registered_buffers;
190
191
void *buf_ctx_handle;
192
} ____cacheline_aligned_in_smp;
193
194
struct ublk_queue {
195
int q_id;
196
int q_depth;
197
198
unsigned long flags;
199
struct ublksrv_io_desc *io_cmd_buf;
200
201
bool force_abort;
202
bool canceling;
203
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
204
unsigned short nr_io_ready; /* how many ios setup */
205
spinlock_t cancel_lock;
206
struct ublk_device *dev;
207
struct ublk_io ios[];
208
};
209
210
struct ublk_device {
211
struct gendisk *ub_disk;
212
213
char *__queues;
214
215
unsigned int queue_size;
216
struct ublksrv_ctrl_dev_info dev_info;
217
218
struct blk_mq_tag_set tag_set;
219
220
struct cdev cdev;
221
struct device cdev_dev;
222
223
#define UB_STATE_OPEN 0
224
#define UB_STATE_USED 1
225
#define UB_STATE_DELETED 2
226
unsigned long state;
227
int ub_number;
228
229
struct mutex mutex;
230
231
spinlock_t lock;
232
struct mm_struct *mm;
233
234
struct ublk_params params;
235
236
struct completion completion;
237
unsigned int nr_queues_ready;
238
bool unprivileged_daemons;
239
struct mutex cancel_mutex;
240
bool canceling;
241
pid_t ublksrv_tgid;
242
struct delayed_work exit_work;
243
};
244
245
/* header of ublk_params */
246
struct ublk_params_header {
247
__u32 len;
248
__u32 types;
249
};
250
251
static void ublk_io_release(void *priv);
252
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
253
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
254
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
255
const struct ublk_queue *ubq, struct ublk_io *io,
256
size_t offset);
257
static inline unsigned int ublk_req_build_flags(struct request *req);
258
259
static inline struct ublksrv_io_desc *
260
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
261
{
262
return &ubq->io_cmd_buf[tag];
263
}
264
265
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
266
{
267
return ub->dev_info.flags & UBLK_F_ZONED;
268
}
269
270
static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
271
{
272
return ubq->flags & UBLK_F_ZONED;
273
}
274
275
#ifdef CONFIG_BLK_DEV_ZONED
276
277
struct ublk_zoned_report_desc {
278
__u64 sector;
279
__u32 operation;
280
__u32 nr_zones;
281
};
282
283
static DEFINE_XARRAY(ublk_zoned_report_descs);
284
285
static int ublk_zoned_insert_report_desc(const struct request *req,
286
struct ublk_zoned_report_desc *desc)
287
{
288
return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
289
desc, GFP_KERNEL);
290
}
291
292
static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
293
const struct request *req)
294
{
295
return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
296
}
297
298
static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
299
const struct request *req)
300
{
301
return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
302
}
303
304
static int ublk_get_nr_zones(const struct ublk_device *ub)
305
{
306
const struct ublk_param_basic *p = &ub->params.basic;
307
308
/* Zone size is a power of 2 */
309
return p->dev_sectors >> ilog2(p->chunk_sectors);
310
}
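
/*
* Worked example (illustrative numbers only): an 8 GiB zoned device with
* 256 MiB zones has dev_sectors = 8 GiB / 512 = 16777216 and
* chunk_sectors = 256 MiB / 512 = 524288, so the helper above returns
* 16777216 >> ilog2(524288) = 16777216 >> 19 = 32 zones.
*/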
311
312
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
313
{
314
return blk_revalidate_disk_zones(ub->ub_disk);
315
}
316
317
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
318
{
319
const struct ublk_param_zoned *p = &ub->params.zoned;
320
int nr_zones;
321
322
if (!ublk_dev_is_zoned(ub))
323
return -EINVAL;
324
325
if (!p->max_zone_append_sectors)
326
return -EINVAL;
327
328
nr_zones = ublk_get_nr_zones(ub);
329
330
if (p->max_active_zones > nr_zones)
331
return -EINVAL;
332
333
if (p->max_open_zones > nr_zones)
334
return -EINVAL;
335
336
return 0;
337
}
338
339
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
340
{
341
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
342
}
343
344
/* Based on virtblk_alloc_report_buffer */
345
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
346
unsigned int nr_zones, size_t *buflen)
347
{
348
struct request_queue *q = ublk->ub_disk->queue;
349
size_t bufsize;
350
void *buf;
351
352
nr_zones = min_t(unsigned int, nr_zones,
353
ublk->ub_disk->nr_zones);
354
355
bufsize = nr_zones * sizeof(struct blk_zone);
356
bufsize =
357
min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
358
359
while (bufsize >= sizeof(struct blk_zone)) {
360
buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
361
if (buf) {
362
*buflen = bufsize;
363
return buf;
364
}
365
bufsize >>= 1;
366
}
367
368
*buflen = 0;
369
return NULL;
370
}
371
372
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
373
unsigned int nr_zones, report_zones_cb cb, void *data)
374
{
375
struct ublk_device *ub = disk->private_data;
376
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
377
unsigned int first_zone = sector >> ilog2(zone_size_sectors);
378
unsigned int done_zones = 0;
379
unsigned int max_zones_per_request;
380
int ret;
381
struct blk_zone *buffer;
382
size_t buffer_length;
383
384
nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
385
nr_zones);
386
387
buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
388
if (!buffer)
389
return -ENOMEM;
390
391
max_zones_per_request = buffer_length / sizeof(struct blk_zone);
392
393
while (done_zones < nr_zones) {
394
unsigned int remaining_zones = nr_zones - done_zones;
395
unsigned int zones_in_request =
396
min_t(unsigned int, remaining_zones, max_zones_per_request);
397
struct request *req;
398
struct ublk_zoned_report_desc desc;
399
blk_status_t status;
400
401
memset(buffer, 0, buffer_length);
402
403
req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
404
if (IS_ERR(req)) {
405
ret = PTR_ERR(req);
406
goto out;
407
}
408
409
desc.operation = UBLK_IO_OP_REPORT_ZONES;
410
desc.sector = sector;
411
desc.nr_zones = zones_in_request;
412
ret = ublk_zoned_insert_report_desc(req, &desc);
413
if (ret)
414
goto free_req;
415
416
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
417
if (ret)
418
goto erase_desc;
419
420
status = blk_execute_rq(req, 0);
421
ret = blk_status_to_errno(status);
422
erase_desc:
423
ublk_zoned_erase_report_desc(req);
424
free_req:
425
blk_mq_free_request(req);
426
if (ret)
427
goto out;
428
429
for (unsigned int i = 0; i < zones_in_request; i++) {
430
struct blk_zone *zone = buffer + i;
431
432
/* A zero length zone means no more zones in this response */
433
if (!zone->len)
434
break;
435
436
ret = cb(zone, i, data);
437
if (ret)
438
goto out;
439
440
done_zones++;
441
sector += zone_size_sectors;
442
443
}
444
}
445
446
ret = done_zones;
447
448
out:
449
kvfree(buffer);
450
return ret;
451
}
452
453
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
454
struct request *req)
455
{
456
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
457
struct ublk_io *io = &ubq->ios[req->tag];
458
struct ublk_zoned_report_desc *desc;
459
u32 ublk_op;
460
461
switch (req_op(req)) {
462
case REQ_OP_ZONE_OPEN:
463
ublk_op = UBLK_IO_OP_ZONE_OPEN;
464
break;
465
case REQ_OP_ZONE_CLOSE:
466
ublk_op = UBLK_IO_OP_ZONE_CLOSE;
467
break;
468
case REQ_OP_ZONE_FINISH:
469
ublk_op = UBLK_IO_OP_ZONE_FINISH;
470
break;
471
case REQ_OP_ZONE_RESET:
472
ublk_op = UBLK_IO_OP_ZONE_RESET;
473
break;
474
case REQ_OP_ZONE_APPEND:
475
ublk_op = UBLK_IO_OP_ZONE_APPEND;
476
break;
477
case REQ_OP_ZONE_RESET_ALL:
478
ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
479
break;
480
case REQ_OP_DRV_IN:
481
desc = ublk_zoned_get_report_desc(req);
482
if (!desc)
483
return BLK_STS_IOERR;
484
ublk_op = desc->operation;
485
switch (ublk_op) {
486
case UBLK_IO_OP_REPORT_ZONES:
487
iod->op_flags = ublk_op | ublk_req_build_flags(req);
488
iod->nr_zones = desc->nr_zones;
489
iod->start_sector = desc->sector;
490
return BLK_STS_OK;
491
default:
492
return BLK_STS_IOERR;
493
}
494
case REQ_OP_DRV_OUT:
495
/* We do not support drv_out */
496
return BLK_STS_NOTSUPP;
497
default:
498
return BLK_STS_IOERR;
499
}
500
501
iod->op_flags = ublk_op | ublk_req_build_flags(req);
502
iod->nr_sectors = blk_rq_sectors(req);
503
iod->start_sector = blk_rq_pos(req);
504
iod->addr = io->addr;
505
506
return BLK_STS_OK;
507
}
508
509
#else
510
511
#define ublk_report_zones (NULL)
512
513
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
514
{
515
return -EOPNOTSUPP;
516
}
517
518
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
519
{
520
}
521
522
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
523
{
524
return 0;
525
}
526
527
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
528
struct request *req)
529
{
530
return BLK_STS_NOTSUPP;
531
}
532
533
#endif
534
535
static inline void __ublk_complete_rq(struct request *req);
536
537
static dev_t ublk_chr_devt;
538
static const struct class ublk_chr_class = {
539
.name = "ublk-char",
540
};
541
542
static DEFINE_IDR(ublk_index_idr);
543
static DEFINE_SPINLOCK(ublk_idr_lock);
544
static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
545
546
static DEFINE_MUTEX(ublk_ctl_mutex);
547
548
549
#define UBLK_MAX_UBLKS UBLK_MINORS
550
551
/*
* Max number of unprivileged ublk devices allowed to be added
*
* It can be extended to a per-user limit in the future, or even controlled
* by cgroup.
*/
557
static unsigned int unprivileged_ublks_max = 64;
558
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
559
560
static struct miscdevice ublk_misc;
561
562
static inline unsigned ublk_pos_to_hwq(loff_t pos)
563
{
564
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
565
UBLK_QID_BITS_MASK;
566
}
567
568
static inline unsigned ublk_pos_to_buf_off(loff_t pos)
569
{
570
return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
571
}
572
573
static inline unsigned ublk_pos_to_tag(loff_t pos)
574
{
575
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
576
UBLK_TAG_BITS_MASK;
577
}
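
/*
* Illustrative note (derived from the uapi macros used above, not a
* normative description): with UBLK_F_USER_COPY the server reads/writes
* request data through the char device at an offset that encodes queue id,
* tag and in-request byte offset, roughly:
*
*	off = UBLKSRV_IO_BUF_OFFSET +
*	      ((__u64)q_id << UBLK_QID_OFF) +
*	      ((__u64)tag << UBLK_TAG_OFF) +
*	      byte_offset_in_request;
*	pread(cdev_fd, buf, len, off);	// hypothetical userspace call
*
* The helpers above decode exactly these fields on the kernel side.
*/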
578
579
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
580
{
581
const struct ublk_param_basic *p = &ub->params.basic;
582
583
if (p->attrs & UBLK_ATTR_READ_ONLY)
584
set_disk_ro(ub->ub_disk, true);
585
586
set_capacity(ub->ub_disk, p->dev_sectors);
587
}
588
589
static int ublk_validate_params(const struct ublk_device *ub)
590
{
591
/* basic param is the only one which must be set */
592
if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
593
const struct ublk_param_basic *p = &ub->params.basic;
594
595
if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
596
return -EINVAL;
597
598
if (p->logical_bs_shift > p->physical_bs_shift)
599
return -EINVAL;
600
601
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
602
return -EINVAL;
603
604
if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
605
return -EINVAL;
606
} else
607
return -EINVAL;
608
609
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
610
const struct ublk_param_discard *p = &ub->params.discard;
611
612
/* So far, only support single segment discard */
613
if (p->max_discard_sectors && p->max_discard_segments != 1)
614
return -EINVAL;
615
616
if (!p->discard_granularity)
617
return -EINVAL;
618
}
619
620
/* dev_t is read-only */
621
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
622
return -EINVAL;
623
624
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
625
return ublk_dev_param_zoned_validate(ub);
626
else if (ublk_dev_is_zoned(ub))
627
return -EINVAL;
628
629
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
630
const struct ublk_param_dma_align *p = &ub->params.dma;
631
632
if (p->alignment >= PAGE_SIZE)
633
return -EINVAL;
634
635
if (!is_power_of_2(p->alignment + 1))
636
return -EINVAL;
637
}
638
639
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
640
const struct ublk_param_segment *p = &ub->params.seg;
641
642
if (!is_power_of_2(p->seg_boundary_mask + 1))
643
return -EINVAL;
644
645
if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
646
return -EINVAL;
647
if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
648
return -EINVAL;
649
}
650
651
return 0;
652
}
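
/*
* A minimal sketch of a parameter set that passes the validation above
* (illustrative values, assuming the ublk_params layout from
* <uapi/linux/ublk_cmd.h>; set via the UBLK_U_CMD_SET_PARAMS control cmd):
*
*	struct ublk_params p = {
*		.len = sizeof(p),
*		.types = UBLK_PARAM_TYPE_BASIC,
*		.basic = {
*			.logical_bs_shift  = 9,		// 512-byte logical blocks
*			.physical_bs_shift = 12,	// 4 KiB physical blocks
*			.io_min_shift      = 9,
*			.io_opt_shift      = 12,
*			.max_sectors       = 1024,	// <= max_io_buf_bytes >> 9
*			.dev_sectors       = 2097152,	// 1 GiB / 512
*		},
*	};
*/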
653
654
static void ublk_apply_params(struct ublk_device *ub)
655
{
656
ublk_dev_param_basic_apply(ub);
657
658
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
659
ublk_dev_param_zoned_apply(ub);
660
}
661
662
static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
663
{
664
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
665
}
666
667
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
668
{
669
return ubq->flags & UBLK_F_AUTO_BUF_REG;
670
}
671
672
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
673
{
674
return ubq->flags & UBLK_F_USER_COPY;
675
}
676
677
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
678
{
679
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
680
!ublk_support_auto_buf_reg(ubq);
681
}
682
683
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
684
{
685
/*
* read()/write() is involved in user copy, so a request reference
* has to be grabbed
*
* For zero copy, the request buffer needs to be registered to the io_uring
* buffer table, so a reference is needed
*
* For auto buffer register, the ublk server may still issue
* UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
* so a reference is required too.
*/
696
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
697
ublk_support_auto_buf_reg(ubq);
698
}
699
700
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
701
struct ublk_io *io)
702
{
703
if (ublk_need_req_ref(ubq))
704
refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
705
}
706
707
static inline bool ublk_get_req_ref(struct ublk_io *io)
708
{
709
return refcount_inc_not_zero(&io->ref);
710
}
711
712
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
713
{
714
if (refcount_dec_and_test(&io->ref))
715
__ublk_complete_rq(req);
716
}
717
718
static inline bool ublk_sub_req_ref(struct ublk_io *io)
719
{
720
unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
721
722
io->task_registered_buffers = 0;
723
return refcount_sub_and_test(sub_refs, &io->ref);
724
}
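
/*
* Reference accounting example (descriptive note, numbers illustrative):
* an io starts at UBLK_REFCOUNT_INIT; if the daemon task still holds one
* auto-registered buffer (task_registered_buffers == 1) when the commit
* arrives, ublk_sub_req_ref() drops UBLK_REFCOUNT_INIT - 1 references, so
* the request completes only once that last buffer is unregistered and
* ublk_put_req_ref() drops the remaining reference.
*/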
725
726
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
727
{
728
return ubq->flags & UBLK_F_NEED_GET_DATA;
729
}
730
731
/* Called in slow path only, keep it noinline for tracing purposes */
732
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
733
{
734
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
735
return ub;
736
return NULL;
737
}
738
739
/* Called in slow path only, keep it noinline for tracing purposes */
740
static noinline void ublk_put_device(struct ublk_device *ub)
741
{
742
put_device(&ub->cdev_dev);
743
}
744
745
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
746
int qid)
747
{
748
return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
749
}
750
751
static inline bool ublk_rq_has_data(const struct request *rq)
752
{
753
return bio_has_data(rq->bio);
754
}
755
756
static inline struct ublksrv_io_desc *
757
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
758
{
759
return ublk_get_queue(ub, q_id)->io_cmd_buf;
760
}
761
762
static inline int __ublk_queue_cmd_buf_size(int depth)
763
{
764
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
765
}
766
767
static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
768
{
769
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
770
771
return __ublk_queue_cmd_buf_size(ubq->q_depth);
772
}
773
774
static int ublk_max_cmd_buf_size(void)
775
{
776
return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
777
}
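
/*
* Size example (assuming the 24-byte struct ublksrv_io_desc from the uapi
* header): a queue depth of 128 needs 128 * 24 = 3072 bytes, rounded up to
* one 4 KiB page; UBLK_MAX_QUEUE_DEPTH bounds the largest per-queue buffer
* the server may mmap().
*/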
778
779
/*
* Should I/O outstanding to the ublk server be reissued when the server
* exits? If not, outstanding I/O will get errors.
*/
783
static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
784
{
785
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
786
(ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
787
}
788
789
/*
* Should I/O issued while there is no ublk server be queued? If not, I/O
* issued while there is no ublk server will get errors.
*/
793
static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
794
{
795
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
796
!(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
797
}
798
799
/*
800
* Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
801
* of the device flags for smaller cache footprint - better for fast
802
* paths.
803
*/
804
static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
805
{
806
return (ubq->flags & UBLK_F_USER_RECOVERY) &&
807
!(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
808
}
809
810
/*
811
* Should ublk devices be stopped (i.e. no recovery possible) when the
812
* ublk server exits? If not, devices can be used again by a future
813
* incarnation of a ublk server via the start_recovery/end_recovery
814
* commands.
815
*/
816
static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
817
{
818
return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
819
}
820
821
static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
822
{
823
return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
824
ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
825
}
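
/*
* Summary of the recovery-flag combinations checked by the helpers above
* (descriptive note derived from the helpers, not additional semantics):
*
*	(none)                       - stop the device when the server exits
*	UBLK_F_USER_RECOVERY         - quiesce and queue I/O until a new
*	                               server attaches
*	 + UBLK_F_USER_RECOVERY_REISSUE - also reissue I/O that was
*	                               outstanding to the old server
*	 + UBLK_F_USER_RECOVERY_FAIL_IO - fail I/O while no server is attached
*/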
826
827
static void ublk_free_disk(struct gendisk *disk)
828
{
829
struct ublk_device *ub = disk->private_data;
830
831
clear_bit(UB_STATE_USED, &ub->state);
832
ublk_put_device(ub);
833
}
834
835
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
836
unsigned int *owner_gid)
837
{
838
kuid_t uid;
839
kgid_t gid;
840
841
current_uid_gid(&uid, &gid);
842
843
*owner_uid = from_kuid(&init_user_ns, uid);
844
*owner_gid = from_kgid(&init_user_ns, gid);
845
}
846
847
static int ublk_open(struct gendisk *disk, blk_mode_t mode)
848
{
849
struct ublk_device *ub = disk->private_data;
850
851
if (capable(CAP_SYS_ADMIN))
852
return 0;
853
854
/*
* If it is an unprivileged device, only the owner can open
* the disk. Otherwise it could be a trap set by a malicious
* user who deliberately grants this disk's privileges to
* other users.
*
* This is also reasonable given that anyone can create an
* unprivileged device without needing anyone else's grant.
*/
863
if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
864
unsigned int curr_uid, curr_gid;
865
866
ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
867
868
if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
869
ub->dev_info.owner_gid)
870
return -EPERM;
871
}
872
873
return 0;
874
}
875
876
static const struct block_device_operations ub_fops = {
877
.owner = THIS_MODULE,
878
.open = ublk_open,
879
.free_disk = ublk_free_disk,
880
.report_zones = ublk_report_zones,
881
};
882
883
#define UBLK_MAX_PIN_PAGES 32
884
885
struct ublk_io_iter {
886
struct page *pages[UBLK_MAX_PIN_PAGES];
887
struct bio *bio;
888
struct bvec_iter iter;
889
};
890
891
/* copy 'total' bytes between the pinned pages and the request's bio data */
892
static void ublk_copy_io_pages(struct ublk_io_iter *data,
893
size_t total, size_t pg_off, int dir)
894
{
895
unsigned done = 0;
896
unsigned pg_idx = 0;
897
898
while (done < total) {
899
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
900
unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
901
(unsigned)(PAGE_SIZE - pg_off));
902
void *bv_buf = bvec_kmap_local(&bv);
903
void *pg_buf = kmap_local_page(data->pages[pg_idx]);
904
905
if (dir == ITER_DEST)
906
memcpy(pg_buf + pg_off, bv_buf, bytes);
907
else
908
memcpy(bv_buf, pg_buf + pg_off, bytes);
909
910
kunmap_local(pg_buf);
911
kunmap_local(bv_buf);
912
913
/* advance page array */
914
pg_off += bytes;
915
if (pg_off == PAGE_SIZE) {
916
pg_idx += 1;
917
pg_off = 0;
918
}
919
920
done += bytes;
921
922
/* advance bio */
923
bio_advance_iter_single(data->bio, &data->iter, bytes);
924
if (!data->iter.bi_size) {
925
data->bio = data->bio->bi_next;
926
if (data->bio == NULL)
927
break;
928
data->iter = data->bio->bi_iter;
929
}
930
}
931
}
932
933
static bool ublk_advance_io_iter(const struct request *req,
934
struct ublk_io_iter *iter, unsigned int offset)
935
{
936
struct bio *bio = req->bio;
937
938
for_each_bio(bio) {
939
if (bio->bi_iter.bi_size > offset) {
940
iter->bio = bio;
941
iter->iter = bio->bi_iter;
942
bio_advance_iter(iter->bio, &iter->iter, offset);
943
return true;
944
}
945
offset -= bio->bi_iter.bi_size;
946
}
947
return false;
948
}
949
950
/*
* Copy data between request pages and the iov_iter; 'offset' is the
* starting linear offset within the request.
*/
954
static size_t ublk_copy_user_pages(const struct request *req,
955
unsigned offset, struct iov_iter *uiter, int dir)
956
{
957
struct ublk_io_iter iter;
958
size_t done = 0;
959
960
if (!ublk_advance_io_iter(req, &iter, offset))
961
return 0;
962
963
while (iov_iter_count(uiter) && iter.bio) {
964
unsigned nr_pages;
965
ssize_t len;
966
size_t off;
967
int i;
968
969
len = iov_iter_get_pages2(uiter, iter.pages,
970
iov_iter_count(uiter),
971
UBLK_MAX_PIN_PAGES, &off);
972
if (len <= 0)
973
return done;
974
975
ublk_copy_io_pages(&iter, len, off, dir);
976
nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
977
for (i = 0; i < nr_pages; i++) {
978
if (dir == ITER_DEST)
979
set_page_dirty(iter.pages[i]);
980
put_page(iter.pages[i]);
981
}
982
done += len;
983
}
984
985
return done;
986
}
987
988
static inline bool ublk_need_map_req(const struct request *req)
989
{
990
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
991
}
992
993
static inline bool ublk_need_unmap_req(const struct request *req)
994
{
995
return ublk_rq_has_data(req) &&
996
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
997
}
998
999
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
1000
const struct ublk_io *io)
1001
{
1002
const unsigned int rq_bytes = blk_rq_bytes(req);
1003
1004
if (!ublk_need_map_io(ubq))
1005
return rq_bytes;
1006
1007
/*
* No zero copy: we delay copying WRITE request data into the ublksrv
* context, and the big benefit is that pinning pages in the current
* context is pretty fast, see ublk_pin_user_pages
*/
1012
if (ublk_need_map_req(req)) {
1013
struct iov_iter iter;
1014
const int dir = ITER_DEST;
1015
1016
import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
1017
return ublk_copy_user_pages(req, 0, &iter, dir);
1018
}
1019
return rq_bytes;
1020
}
1021
1022
static int ublk_unmap_io(const struct ublk_queue *ubq,
1023
const struct request *req,
1024
const struct ublk_io *io)
1025
{
1026
const unsigned int rq_bytes = blk_rq_bytes(req);
1027
1028
if (!ublk_need_map_io(ubq))
1029
return rq_bytes;
1030
1031
if (ublk_need_unmap_req(req)) {
1032
struct iov_iter iter;
1033
const int dir = ITER_SOURCE;
1034
1035
WARN_ON_ONCE(io->res > rq_bytes);
1036
1037
import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
1038
return ublk_copy_user_pages(req, 0, &iter, dir);
1039
}
1040
return rq_bytes;
1041
}
1042
1043
static inline unsigned int ublk_req_build_flags(struct request *req)
1044
{
1045
unsigned flags = 0;
1046
1047
if (req->cmd_flags & REQ_FAILFAST_DEV)
1048
flags |= UBLK_IO_F_FAILFAST_DEV;
1049
1050
if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1051
flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1052
1053
if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1054
flags |= UBLK_IO_F_FAILFAST_DRIVER;
1055
1056
if (req->cmd_flags & REQ_META)
1057
flags |= UBLK_IO_F_META;
1058
1059
if (req->cmd_flags & REQ_FUA)
1060
flags |= UBLK_IO_F_FUA;
1061
1062
if (req->cmd_flags & REQ_NOUNMAP)
1063
flags |= UBLK_IO_F_NOUNMAP;
1064
1065
if (req->cmd_flags & REQ_SWAP)
1066
flags |= UBLK_IO_F_SWAP;
1067
1068
return flags;
1069
}
1070
1071
static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1072
{
1073
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1074
struct ublk_io *io = &ubq->ios[req->tag];
1075
enum req_op op = req_op(req);
1076
u32 ublk_op;
1077
1078
if (!ublk_queue_is_zoned(ubq) &&
1079
(op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
1080
return BLK_STS_IOERR;
1081
1082
switch (req_op(req)) {
1083
case REQ_OP_READ:
1084
ublk_op = UBLK_IO_OP_READ;
1085
break;
1086
case REQ_OP_WRITE:
1087
ublk_op = UBLK_IO_OP_WRITE;
1088
break;
1089
case REQ_OP_FLUSH:
1090
ublk_op = UBLK_IO_OP_FLUSH;
1091
break;
1092
case REQ_OP_DISCARD:
1093
ublk_op = UBLK_IO_OP_DISCARD;
1094
break;
1095
case REQ_OP_WRITE_ZEROES:
1096
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1097
break;
1098
default:
1099
if (ublk_queue_is_zoned(ubq))
1100
return ublk_setup_iod_zoned(ubq, req);
1101
return BLK_STS_IOERR;
1102
}
1103
1104
/* need to translate since kernel may change */
1105
iod->op_flags = ublk_op | ublk_req_build_flags(req);
1106
iod->nr_sectors = blk_rq_sectors(req);
1107
iod->start_sector = blk_rq_pos(req);
1108
iod->addr = io->addr;
1109
1110
return BLK_STS_OK;
1111
}
1112
1113
static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1114
struct io_uring_cmd *ioucmd)
1115
{
1116
return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1117
}
1118
1119
/* todo: handle partial completion */
1120
static inline void __ublk_complete_rq(struct request *req)
1121
{
1122
struct ublk_queue *ubq = req->mq_hctx->driver_data;
1123
struct ublk_io *io = &ubq->ios[req->tag];
1124
unsigned int unmapped_bytes;
1125
blk_status_t res = BLK_STS_OK;
1126
1127
/* failed read IO if nothing is read */
1128
if (!io->res && req_op(req) == REQ_OP_READ)
1129
io->res = -EIO;
1130
1131
if (io->res < 0) {
1132
res = errno_to_blk_status(io->res);
1133
goto exit;
1134
}
1135
1136
/*
* FLUSH, DISCARD and WRITE_ZEROES usually won't return any data, so end
* them directly.
*
* None of them needs unmapping.
*/
1142
if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1143
req_op(req) != REQ_OP_DRV_IN)
1144
goto exit;
1145
1146
/* for READ request, writing data in iod->addr to rq buffers */
1147
unmapped_bytes = ublk_unmap_io(ubq, req, io);
1148
1149
/*
* Extremely unlikely since the data was filled in just before
*
* Simply re-read for this unlikely case.
*/
1154
if (unlikely(unmapped_bytes < io->res))
1155
io->res = unmapped_bytes;
1156
1157
if (blk_update_request(req, BLK_STS_OK, io->res))
1158
blk_mq_requeue_request(req, true);
1159
else if (likely(!blk_should_fake_timeout(req->q)))
1160
__blk_mq_end_request(req, BLK_STS_OK);
1161
1162
return;
1163
exit:
1164
blk_mq_end_request(req, res);
1165
}
1166
1167
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1168
struct request *req)
1169
{
1170
/* read cmd first because req will overwrite it */
1171
struct io_uring_cmd *cmd = io->cmd;
1172
1173
/* mark this cmd owned by ublksrv */
1174
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1175
1176
/*
* clear ACTIVE since we are done with this sqe/cmd slot
* We can only accept an io cmd when it is not active.
*/
1180
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1181
1182
io->req = req;
1183
return cmd;
1184
}
1185
1186
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1187
int res, unsigned issue_flags)
1188
{
1189
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1190
1191
/* tell ublksrv one io request is coming */
1192
io_uring_cmd_done(cmd, res, 0, issue_flags);
1193
}
1194
1195
#define UBLK_REQUEUE_DELAY_MS 3
1196
1197
static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1198
struct request *rq)
1199
{
1200
/* We cannot process this rq so just requeue it. */
1201
if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1202
blk_mq_requeue_request(rq, false);
1203
else
1204
blk_mq_end_request(rq, BLK_STS_IOERR);
1205
}
1206
1207
static void
1208
ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
1209
{
1210
unsigned tag = io - ubq->ios;
1211
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1212
1213
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1214
}
1215
1216
static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
1217
struct ublk_io *io, unsigned int issue_flags)
1218
{
1219
int ret;
1220
1221
ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
1222
io->buf.index, issue_flags);
1223
if (ret) {
1224
if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1225
ublk_auto_buf_reg_fallback(ubq, io);
1226
return true;
1227
}
1228
blk_mq_end_request(req, BLK_STS_IOERR);
1229
return false;
1230
}
1231
1232
io->task_registered_buffers = 1;
1233
io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
1234
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1235
return true;
1236
}
1237
1238
static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
1239
struct request *req, struct ublk_io *io,
1240
unsigned int issue_flags)
1241
{
1242
ublk_init_req_ref(ubq, io);
1243
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
1244
return ublk_auto_buf_reg(ubq, req, io, issue_flags);
1245
1246
return true;
1247
}
1248
1249
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1250
struct ublk_io *io)
1251
{
1252
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1253
1254
/* partially mapped, update io descriptor */
1255
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1256
/*
1257
* Nothing mapped, retry until we succeed.
1258
*
1259
* We may never succeed in mapping any bytes here because
1260
* of OOM. TODO: reserve one buffer with single page pinned
1261
* for providing forward progress guarantee.
1262
*/
1263
if (unlikely(!mapped_bytes)) {
1264
blk_mq_requeue_request(req, false);
1265
blk_mq_delay_kick_requeue_list(req->q,
1266
UBLK_REQUEUE_DELAY_MS);
1267
return false;
1268
}
1269
1270
ublk_get_iod(ubq, req->tag)->nr_sectors =
1271
mapped_bytes >> 9;
1272
}
1273
1274
return true;
1275
}
1276
1277
static void ublk_dispatch_req(struct ublk_queue *ubq,
1278
struct request *req,
1279
unsigned int issue_flags)
1280
{
1281
int tag = req->tag;
1282
struct ublk_io *io = &ubq->ios[tag];
1283
1284
pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1285
__func__, ubq->q_id, req->tag, io->flags,
1286
ublk_get_iod(ubq, req->tag)->addr);
1287
1288
/*
1289
* Task is exiting if either:
1290
*
1291
* (1) current != io->task.
1292
* io_uring_cmd_complete_in_task() tries to run task_work
1293
* in a workqueue if cmd's task is PF_EXITING.
1294
*
1295
* (2) current->flags & PF_EXITING.
1296
*/
1297
if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1298
__ublk_abort_rq(ubq, req);
1299
return;
1300
}
1301
1302
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1303
/*
1304
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
1305
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1306
* and notify it.
1307
*/
1308
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1309
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1310
__func__, ubq->q_id, req->tag, io->flags);
1311
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1312
issue_flags);
1313
return;
1314
}
1315
1316
if (!ublk_start_io(ubq, req, io))
1317
return;
1318
1319
if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
1320
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1321
}
1322
1323
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
1324
unsigned int issue_flags)
1325
{
1326
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1327
struct ublk_queue *ubq = pdu->ubq;
1328
1329
ublk_dispatch_req(ubq, pdu->req, issue_flags);
1330
}
1331
1332
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1333
{
1334
struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
1335
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1336
1337
pdu->req = rq;
1338
io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
1339
}
1340
1341
static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
1342
unsigned int issue_flags)
1343
{
1344
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1345
struct request *rq = pdu->req_list;
1346
struct request *next;
1347
1348
do {
1349
next = rq->rq_next;
1350
rq->rq_next = NULL;
1351
ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
1352
rq = next;
1353
} while (rq);
1354
}
1355
1356
static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
1357
{
1358
struct io_uring_cmd *cmd = io->cmd;
1359
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1360
1361
pdu->req_list = rq_list_peek(l);
1362
rq_list_init(l);
1363
io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
1364
}
1365
1366
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1367
{
1368
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1369
pid_t tgid = ubq->dev->ublksrv_tgid;
1370
struct task_struct *p;
1371
struct pid *pid;
1372
1373
if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
1374
return BLK_EH_RESET_TIMER;
1375
1376
if (unlikely(!tgid))
1377
return BLK_EH_RESET_TIMER;
1378
1379
rcu_read_lock();
1380
pid = find_vpid(tgid);
1381
p = pid_task(pid, PIDTYPE_PID);
1382
if (p)
1383
send_sig(SIGKILL, p, 0);
1384
rcu_read_unlock();
1385
return BLK_EH_DONE;
1386
}
1387
1388
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
1389
bool check_cancel)
1390
{
1391
blk_status_t res;
1392
1393
if (unlikely(READ_ONCE(ubq->fail_io)))
1394
return BLK_STS_TARGET;
1395
1396
/* With the recovery feature enabled, force_abort is set in
* ublk_stop_dev() before calling del_gendisk(). We have to
* abort all requeued and new rqs here to let del_gendisk()
* move on. Besides, we cannot call io_uring_cmd_complete_in_task(),
* to avoid UAF on the io_uring ctx.
*
* Note: force_abort is guaranteed to be seen because it is set
* before the request queue is unquiesced.
*/
1405
if (ublk_nosrv_should_queue_io(ubq) &&
1406
unlikely(READ_ONCE(ubq->force_abort)))
1407
return BLK_STS_IOERR;
1408
1409
if (check_cancel && unlikely(ubq->canceling))
1410
return BLK_STS_IOERR;
1411
1412
/* fill iod to slot in io cmd buffer */
1413
res = ublk_setup_iod(ubq, rq);
1414
if (unlikely(res != BLK_STS_OK))
1415
return BLK_STS_IOERR;
1416
1417
blk_mq_start_request(rq);
1418
return BLK_STS_OK;
1419
}
1420
1421
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1422
const struct blk_mq_queue_data *bd)
1423
{
1424
struct ublk_queue *ubq = hctx->driver_data;
1425
struct request *rq = bd->rq;
1426
blk_status_t res;
1427
1428
res = ublk_prep_req(ubq, rq, false);
1429
if (res != BLK_STS_OK)
1430
return res;
1431
1432
/*
* ->canceling has to be handled after ->force_abort and ->fail_io
* are dealt with, otherwise this request may not be failed in case
* of recovery, and may cause a hang when deleting the disk
*/
1437
if (unlikely(ubq->canceling)) {
1438
__ublk_abort_rq(ubq, rq);
1439
return BLK_STS_OK;
1440
}
1441
1442
ublk_queue_cmd(ubq, rq);
1443
return BLK_STS_OK;
1444
}
1445
1446
static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1447
const struct ublk_io *io2)
1448
{
1449
return (io_uring_cmd_ctx_handle(io->cmd) ==
1450
io_uring_cmd_ctx_handle(io2->cmd)) &&
1451
(io->task == io2->task);
1452
}
1453
1454
static void ublk_queue_rqs(struct rq_list *rqlist)
1455
{
1456
struct rq_list requeue_list = { };
1457
struct rq_list submit_list = { };
1458
struct ublk_io *io = NULL;
1459
struct request *req;
1460
1461
while ((req = rq_list_pop(rqlist))) {
1462
struct ublk_queue *this_q = req->mq_hctx->driver_data;
1463
struct ublk_io *this_io = &this_q->ios[req->tag];
1464
1465
if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
1466
rq_list_add_tail(&requeue_list, req);
1467
continue;
1468
}
1469
1470
if (io && !ublk_belong_to_same_batch(io, this_io) &&
1471
!rq_list_empty(&submit_list))
1472
ublk_queue_cmd_list(io, &submit_list);
1473
io = this_io;
1474
rq_list_add_tail(&submit_list, req);
1475
}
1476
1477
if (!rq_list_empty(&submit_list))
1478
ublk_queue_cmd_list(io, &submit_list);
1479
*rqlist = requeue_list;
1480
}
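
/*
* Descriptive note on the batching above: requests pulled from rqlist are
* grouped into one task-work invocation as long as consecutive requests
* target ios that share both the io_uring context and the daemon task (see
* ublk_belong_to_same_batch()); anything that fails ublk_prep_req() is put
* on the caller's requeue list instead.
*/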
1481
1482
static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1483
unsigned int hctx_idx)
1484
{
1485
struct ublk_device *ub = driver_data;
1486
struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1487
1488
hctx->driver_data = ubq;
1489
return 0;
1490
}
1491
1492
static const struct blk_mq_ops ublk_mq_ops = {
1493
.queue_rq = ublk_queue_rq,
1494
.queue_rqs = ublk_queue_rqs,
1495
.init_hctx = ublk_init_hctx,
1496
.timeout = ublk_timeout,
1497
};
1498
1499
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1500
{
1501
int i;
1502
1503
/* All old ioucmds have to be completed */
1504
ubq->nr_io_ready = 0;
1505
1506
for (i = 0; i < ubq->q_depth; i++) {
1507
struct ublk_io *io = &ubq->ios[i];
1508
1509
/*
* UBLK_IO_FLAG_CANCELED is kept to avoid touching
* io->cmd
*/
1513
io->flags &= UBLK_IO_FLAG_CANCELED;
1514
io->cmd = NULL;
1515
io->addr = 0;
1516
1517
/*
* The old task is PF_EXITING, put it now
*
* It could be NULL when closing a quiesced
* device.
*/
1523
if (io->task) {
1524
put_task_struct(io->task);
1525
io->task = NULL;
1526
}
1527
1528
WARN_ON_ONCE(refcount_read(&io->ref));
1529
WARN_ON_ONCE(io->task_registered_buffers);
1530
}
1531
}
1532
1533
static int ublk_ch_open(struct inode *inode, struct file *filp)
1534
{
1535
struct ublk_device *ub = container_of(inode->i_cdev,
1536
struct ublk_device, cdev);
1537
1538
if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1539
return -EBUSY;
1540
filp->private_data = ub;
1541
ub->ublksrv_tgid = current->tgid;
1542
return 0;
1543
}
1544
1545
static void ublk_reset_ch_dev(struct ublk_device *ub)
1546
{
1547
int i;
1548
1549
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1550
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1551
1552
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
1553
ub->mm = NULL;
1554
ub->nr_queues_ready = 0;
1555
ub->unprivileged_daemons = false;
1556
ub->ublksrv_tgid = -1;
1557
}
1558
1559
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
1560
{
1561
struct gendisk *disk;
1562
1563
spin_lock(&ub->lock);
1564
disk = ub->ub_disk;
1565
if (disk)
1566
get_device(disk_to_dev(disk));
1567
spin_unlock(&ub->lock);
1568
1569
return disk;
1570
}
1571
1572
static void ublk_put_disk(struct gendisk *disk)
1573
{
1574
if (disk)
1575
put_device(disk_to_dev(disk));
1576
}
1577
1578
/*
1579
* Use this function to ensure that ->canceling is consistently set for
1580
* the device and all queues. Do not set these flags directly.
1581
*
1582
* Caller must ensure that:
1583
* - cancel_mutex is held. This ensures that there is no concurrent
1584
* access to ub->canceling and no concurrent writes to ubq->canceling.
1585
* - there are no concurrent reads of ubq->canceling from the queue_rq
1586
* path. This can be done by quiescing the queue, or through other
1587
* means.
1588
*/
1589
static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
1590
__must_hold(&ub->cancel_mutex)
1591
{
1592
int i;
1593
1594
ub->canceling = canceling;
1595
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1596
ublk_get_queue(ub, i)->canceling = canceling;
1597
}
1598
1599
static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
1600
{
1601
int i, j;
1602
1603
if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
1604
UBLK_F_AUTO_BUF_REG)))
1605
return false;
1606
1607
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1608
struct ublk_queue *ubq = ublk_get_queue(ub, i);
1609
1610
for (j = 0; j < ubq->q_depth; j++) {
1611
struct ublk_io *io = &ubq->ios[j];
1612
unsigned int refs = refcount_read(&io->ref) +
1613
io->task_registered_buffers;
1614
1615
/*
1616
* UBLK_REFCOUNT_INIT or zero means no active
1617
* reference
1618
*/
1619
if (refs != UBLK_REFCOUNT_INIT && refs != 0)
1620
return true;
1621
1622
/* reset to zero if the io hasn't active references */
1623
refcount_set(&io->ref, 0);
1624
io->task_registered_buffers = 0;
1625
}
1626
}
1627
return false;
1628
}
1629
1630
static void ublk_ch_release_work_fn(struct work_struct *work)
1631
{
1632
struct ublk_device *ub =
1633
container_of(work, struct ublk_device, exit_work.work);
1634
struct gendisk *disk;
1635
int i;
1636
1637
/*
* For zero-copy and auto buffer register modes, I/O references
* might not be dropped naturally when the daemon is killed, but
* io_uring guarantees that registered bvec kernel buffers are
* finally unregistered when the io_uring context is freed, and the
* active references are dropped then.
*
* Wait until the active references are dropped to avoid use-after-free.
*
* A registered buffer may be unregistered in io_uring's release handler,
* so we have to wait by scheduling a work function to avoid a dependency
* between the two file releases.
*/
1650
if (ublk_check_and_reset_active_ref(ub)) {
1651
schedule_delayed_work(&ub->exit_work, 1);
1652
return;
1653
}
1654
1655
/*
* The disk isn't attached: either the device isn't live, or it has
* already been removed, so we needn't do anything
*/
1659
disk = ublk_get_disk(ub);
1660
if (!disk)
1661
goto out;
1662
1663
/*
* All uring_cmds are done now, so abort any request outstanding to
* the ublk server
*
* This can be done in a lockless way because the ublk server is
* gone
*
* More importantly, we have to provide a forward progress guarantee
* without holding ub->mutex, otherwise a control task grabbing
* ub->mutex triggers deadlock
*
* All requests may be inflight, so ->canceling may not be set; set
* it now.
*/
1677
mutex_lock(&ub->cancel_mutex);
1678
ublk_set_canceling(ub, true);
1679
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1680
ublk_abort_queue(ub, ublk_get_queue(ub, i));
1681
mutex_unlock(&ub->cancel_mutex);
1682
blk_mq_kick_requeue_list(disk->queue);
1683
1684
/*
* All inflight requests have been completed or requeued and any new
* request will be failed or requeued via `->canceling` now, so it is
* fine to grab ub->mutex now.
*/
1689
mutex_lock(&ub->mutex);
1690
1691
/* double check after grabbing lock */
1692
if (!ub->ub_disk)
1693
goto unlock;
1694
1695
/*
1696
* Transition the device to the nosrv state. What exactly this
1697
* means depends on the recovery flags
1698
*/
1699
if (ublk_nosrv_should_stop_dev(ub)) {
1700
/*
1701
* Allow any pending/future I/O to pass through quickly
1702
* with an error. This is needed because del_gendisk
1703
* waits for all pending I/O to complete
1704
*/
1705
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1706
WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
1707
1708
ublk_stop_dev_unlocked(ub);
1709
} else {
1710
if (ublk_nosrv_dev_should_queue_io(ub)) {
1711
/* ->canceling is set and all requests are aborted */
1712
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1713
} else {
1714
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
1715
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1716
WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
1717
}
1718
}
1719
unlock:
1720
mutex_unlock(&ub->mutex);
1721
ublk_put_disk(disk);
1722
1723
/* all uring_cmds have been done now, reset device & ubq */
1724
ublk_reset_ch_dev(ub);
1725
out:
1726
clear_bit(UB_STATE_OPEN, &ub->state);
1727
1728
/* put the reference grabbed in ublk_ch_release() */
1729
ublk_put_device(ub);
1730
}
1731
1732
static int ublk_ch_release(struct inode *inode, struct file *filp)
1733
{
1734
struct ublk_device *ub = filp->private_data;
1735
1736
/*
1737
* Grab ublk device reference, so it won't be gone until we are
1738
* really released from work function.
1739
*/
1740
ublk_get_device(ub);
1741
1742
INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
1743
schedule_delayed_work(&ub->exit_work, 0);
1744
return 0;
1745
}
1746
1747
/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
1748
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1749
{
1750
struct ublk_device *ub = filp->private_data;
1751
size_t sz = vma->vm_end - vma->vm_start;
1752
unsigned max_sz = ublk_max_cmd_buf_size();
1753
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1754
int q_id, ret = 0;
1755
1756
spin_lock(&ub->lock);
1757
if (!ub->mm)
1758
ub->mm = current->mm;
1759
if (current->mm != ub->mm)
1760
ret = -EINVAL;
1761
spin_unlock(&ub->lock);
1762
1763
if (ret)
1764
return ret;
1765
1766
if (vma->vm_flags & VM_WRITE)
1767
return -EPERM;
1768
1769
end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1770
if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1771
return -EINVAL;
1772
1773
q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1774
pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1775
__func__, q_id, current->pid, vma->vm_start,
1776
phys_off, (unsigned long)sz);
1777
1778
if (sz != ublk_queue_cmd_buf_size(ub, q_id))
1779
return -EINVAL;
1780
1781
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1782
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1783
}
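
/*
* A sketch (illustrative, not part of the driver) of the matching userspace
* mapping a ublk server would set up for queue q_id; the size and offset
* mirror ublk_queue_cmd_buf_size() and UBLKSRV_CMD_BUF_OFFSET above, and
* round_up_to_pages() is a hypothetical helper:
*
*	// stride per queue: UBLK_MAX_QUEUE_DEPTH descriptors, page aligned
*	size_t max_sz = round_up_to_pages(UBLK_MAX_QUEUE_DEPTH *
*					  sizeof(struct ublksrv_io_desc));
*	size_t len = round_up_to_pages(queue_depth *
*				       sizeof(struct ublksrv_io_desc));
*	struct ublksrv_io_desc *descs =
*		mmap(NULL, len, PROT_READ, MAP_SHARED, cdev_fd,
*		     UBLKSRV_CMD_BUF_OFFSET + (off_t)q_id * max_sz);
*
* The mapping is read-only for the server (VM_WRITE is rejected above); the
* driver fills one descriptor per tag before completing the fetch command.
*/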
1784
1785
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1786
struct request *req)
1787
{
1788
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1789
1790
if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
1791
blk_mq_requeue_request(req, false);
1792
else {
1793
io->res = -EIO;
1794
__ublk_complete_rq(req);
1795
}
1796
}
1797
1798
/*
* Called from the ublk char device release handler, once every uring_cmd is
* done; meanwhile the request queue is "quiesced" since all inflight requests
* can't be completed because the ublk server is dead.
*
* So no one can hold our request IO reference any more; simply ignore the
* reference, and complete the request immediately
*/
1806
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1807
{
1808
int i;
1809
1810
for (i = 0; i < ubq->q_depth; i++) {
1811
struct ublk_io *io = &ubq->ios[i];
1812
1813
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1814
__ublk_fail_req(ubq, io, io->req);
1815
}
1816
}
1817
1818
static void ublk_start_cancel(struct ublk_device *ub)
1819
{
1820
struct gendisk *disk = ublk_get_disk(ub);
1821
1822
/* Our disk is already gone */
1823
if (!disk)
1824
return;
1825
1826
mutex_lock(&ub->cancel_mutex);
1827
if (ub->canceling)
1828
goto out;
1829
/*
* Now we are serialized with ublk_queue_rq()
*
* Make sure that ubq->canceling is set while the queue is quiesced,
* because ublk_queue_rq() has to rely on this flag to avoid touching
* a completed uring_cmd
*/
1836
blk_mq_quiesce_queue(disk->queue);
1837
ublk_set_canceling(ub, true);
1838
blk_mq_unquiesce_queue(disk->queue);
1839
out:
1840
mutex_unlock(&ub->cancel_mutex);
1841
ublk_put_disk(disk);
1842
}
1843
1844
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
1845
unsigned int issue_flags)
1846
{
1847
struct ublk_io *io = &ubq->ios[tag];
1848
struct ublk_device *ub = ubq->dev;
1849
struct request *req;
1850
bool done;
1851
1852
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1853
return;
1854
1855
/*
* Don't try to cancel this command if the request has been started, to
* avoid a race between io_uring_cmd_done() and
* io_uring_cmd_complete_in_task().
*
* Either the started request will be aborted via __ublk_abort_rq(),
* then this uring_cmd is canceled next time, or it will be done in
* the task work function ublk_dispatch_req() because io_uring guarantees
* that ublk_dispatch_req() is always called
*/
1865
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1866
if (req && blk_mq_request_started(req) && req->tag == tag)
1867
return;
1868
1869
spin_lock(&ubq->cancel_lock);
1870
done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1871
if (!done)
1872
io->flags |= UBLK_IO_FLAG_CANCELED;
1873
spin_unlock(&ubq->cancel_lock);
1874
1875
if (!done)
1876
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
1877
}
1878
1879
/*
* The ublk char device won't be closed when the cancel fn is called, so both
* the ublk device and the queue are guaranteed to be live
*
* Two-stage cancel:
*
* - make every active uring_cmd done in ->cancel_fn()
*
* - abort inflight ublk IO requests in the ublk char device release handler,
* which depends on the 1st stage because the device can only be closed once
* all uring_cmds are done
*
* Do _not_ try to acquire ub->mutex before all inflight requests are
* aborted, otherwise a deadlock may be caused.
*/
1894
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1895
unsigned int issue_flags)
1896
{
1897
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1898
struct ublk_queue *ubq = pdu->ubq;
1899
struct task_struct *task;
1900
struct ublk_io *io;
1901
1902
if (WARN_ON_ONCE(!ubq))
1903
return;
1904
1905
if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1906
return;
1907
1908
task = io_uring_cmd_get_task(cmd);
1909
io = &ubq->ios[pdu->tag];
1910
if (WARN_ON_ONCE(task && task != io->task))
1911
return;
1912
1913
ublk_start_cancel(ubq->dev);
1914
1915
WARN_ON_ONCE(io->cmd != cmd);
1916
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
1917
}
1918
1919
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1920
{
1921
return ubq->nr_io_ready == ubq->q_depth;
1922
}
1923
1924
static void ublk_cancel_queue(struct ublk_queue *ubq)
1925
{
1926
int i;
1927
1928
for (i = 0; i < ubq->q_depth; i++)
1929
ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
1930
}
1931
1932
/* Cancel all pending commands, must be called after del_gendisk() returns */
1933
static void ublk_cancel_dev(struct ublk_device *ub)
1934
{
1935
int i;
1936
1937
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1938
ublk_cancel_queue(ublk_get_queue(ub, i));
1939
}
1940
1941
static bool ublk_check_inflight_rq(struct request *rq, void *data)
1942
{
1943
bool *idle = data;
1944
1945
if (blk_mq_request_started(rq)) {
1946
*idle = false;
1947
return false;
1948
}
1949
return true;
1950
}
1951
1952
static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1953
{
1954
bool idle;
1955
1956
WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1957
while (true) {
1958
idle = true;
1959
blk_mq_tagset_busy_iter(&ub->tag_set,
1960
ublk_check_inflight_rq, &idle);
1961
if (idle)
1962
break;
1963
msleep(UBLK_REQUEUE_DELAY_MS);
1964
}
1965
}
1966
1967
static void ublk_force_abort_dev(struct ublk_device *ub)
1968
{
1969
int i;
1970
1971
pr_devel("%s: force abort ub: dev_id %d state %s\n",
1972
__func__, ub->dev_info.dev_id,
1973
ub->dev_info.state == UBLK_S_DEV_LIVE ?
1974
"LIVE" : "QUIESCED");
1975
blk_mq_quiesce_queue(ub->ub_disk->queue);
1976
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1977
ublk_wait_tagset_rqs_idle(ub);
1978
1979
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1980
ublk_get_queue(ub, i)->force_abort = true;
1981
blk_mq_unquiesce_queue(ub->ub_disk->queue);
1982
/* We may have requeued some rqs in ublk_quiesce_queue() */
1983
blk_mq_kick_requeue_list(ub->ub_disk->queue);
1984
}
1985
1986
static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
1987
{
1988
struct gendisk *disk;
1989
1990
/* Sync with ublk_abort_queue() by holding the lock */
1991
spin_lock(&ub->lock);
1992
disk = ub->ub_disk;
1993
ub->dev_info.state = UBLK_S_DEV_DEAD;
1994
ub->dev_info.ublksrv_pid = -1;
1995
ub->ub_disk = NULL;
1996
spin_unlock(&ub->lock);
1997
1998
return disk;
1999
}
2000
2001
static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2002
__must_hold(&ub->mutex)
2003
{
2004
struct gendisk *disk;
2005
2006
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2007
return;
2008
2009
if (ublk_nosrv_dev_should_queue_io(ub))
2010
ublk_force_abort_dev(ub);
2011
del_gendisk(ub->ub_disk);
2012
disk = ublk_detach_disk(ub);
2013
put_disk(disk);
2014
}
2015
2016
static void ublk_stop_dev(struct ublk_device *ub)
2017
{
2018
mutex_lock(&ub->mutex);
2019
ublk_stop_dev_unlocked(ub);
2020
mutex_unlock(&ub->mutex);
2021
ublk_cancel_dev(ub);
2022
}
2023
2024
/* reset ublk io_uring queue & io flags */
2025
static void ublk_reset_io_flags(struct ublk_device *ub)
2026
{
2027
int i, j;
2028
2029
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2030
struct ublk_queue *ubq = ublk_get_queue(ub, i);
2031
2032
/* UBLK_IO_FLAG_CANCELED can be cleared now */
2033
spin_lock(&ubq->cancel_lock);
2034
for (j = 0; j < ubq->q_depth; j++)
2035
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2036
spin_unlock(&ubq->cancel_lock);
2037
ubq->fail_io = false;
2038
}
2039
mutex_lock(&ub->cancel_mutex);
2040
ublk_set_canceling(ub, false);
2041
mutex_unlock(&ub->cancel_mutex);
2042
}
2043
2044
/* device can only be started after all IOs are ready */
2045
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
2046
__must_hold(&ub->mutex)
2047
{
2048
ubq->nr_io_ready++;
2049
if (ublk_queue_ready(ubq))
2050
ub->nr_queues_ready++;
2051
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2052
ub->unprivileged_daemons = true;
2053
2054
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
2055
/* now we are ready for handling ublk io request */
2056
ublk_reset_io_flags(ub);
2057
complete_all(&ub->completion);
2058
}
2059
}
2060
2061
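/*
 * ublk commands are ioctl-encoded (_IOC_TYPE 'u') by default; the legacy
 * plain opcodes (type 0) are only accepted when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled.
 */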
static inline int ublk_check_cmd_op(u32 cmd_op)
2062
{
2063
u32 ioc_type = _IOC_TYPE(cmd_op);
2064
2065
if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2066
return -EOPNOTSUPP;
2067
2068
if (ioc_type != 'u' && ioc_type != 0)
2069
return -EOPNOTSUPP;
2070
2071
return 0;
2072
}
2073
2074
static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2075
{
2076
io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2077
2078
if (io->buf.reserved0 || io->buf.reserved1)
2079
return -EINVAL;
2080
2081
if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2082
return -EINVAL;
2083
return 0;
2084
}
2085
2086
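/*
 * Auto buffer registration (UBLK_F_AUTO_BUF_REG): the registration
 * descriptor for the next request is decoded from sqe->addr by
 * ublk_set_auto_buf_reg(); a buffer auto-registered for the previous
 * request is only scheduled for unregistration (via *buf_idx) when it was
 * registered on this uring_cmd's io_ring_ctx.
 */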
static int ublk_handle_auto_buf_reg(struct ublk_io *io,
2087
struct io_uring_cmd *cmd,
2088
u16 *buf_idx)
2089
{
2090
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2091
io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2092
2093
/*
2094
* `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
2095
* and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
2096
* `io_ring_ctx`.
2097
*
2098
* If this uring_cmd's io_ring_ctx isn't the same as the
2099
* one used for registering the buffer, it is the ublk server's
2100
* responsibility to unregister the buffer; otherwise
2101
* this ublk request gets stuck.
2102
*/
2103
if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
2104
*buf_idx = io->buf.index;
2105
}
2106
2107
return ublk_set_auto_buf_reg(io, cmd);
2108
}
2109
2110
/* Once we return, `io->req` can't be used any more */
2111
static inline struct request *
2112
ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
2113
{
2114
struct request *req = io->req;
2115
2116
io->cmd = cmd;
2117
io->flags |= UBLK_IO_FLAG_ACTIVE;
2118
/* now this cmd slot is owned by ublk driver */
2119
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2120
2121
return req;
2122
}
2123
2124
static inline int
2125
ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
2126
struct io_uring_cmd *cmd, unsigned long buf_addr,
2127
u16 *buf_idx)
2128
{
2129
if (ublk_support_auto_buf_reg(ubq))
2130
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
2131
2132
io->addr = buf_addr;
2133
return 0;
2134
}
2135
2136
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
2137
unsigned int issue_flags,
2138
struct ublk_queue *ubq, unsigned int tag)
2139
{
2140
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2141
2142
/*
2143
* Safe to refer to @ubq since the ublk_queue won't die until its
2144
* commands are completed
2145
*/
2146
pdu->ubq = ubq;
2147
pdu->tag = tag;
2148
io_uring_cmd_mark_cancelable(cmd, issue_flags);
2149
}
2150
2151
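/*
 * Release callback passed to io_buffer_register_bvec(): invoked when the
 * registered buffer is dropped.  On the io task a cached per-task counter
 * is decremented; otherwise a full request reference is released.
 */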
static void ublk_io_release(void *priv)
2152
{
2153
struct request *rq = priv;
2154
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2155
struct ublk_io *io = &ubq->ios[rq->tag];
2156
2157
/*
2158
* task_registered_buffers may be 0 if buffers were registered off the io task
2159
* but unregistered on it, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
2160
*/
2161
if (current == io->task && io->task_registered_buffers)
2162
io->task_registered_buffers--;
2163
else
2164
ublk_put_req_ref(io, rq);
2165
}
2166
2167
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
2168
const struct ublk_queue *ubq,
2169
struct ublk_io *io,
2170
unsigned int index, unsigned int issue_flags)
2171
{
2172
struct ublk_device *ub = cmd->file->private_data;
2173
struct request *req;
2174
int ret;
2175
2176
if (!ublk_support_zero_copy(ubq))
2177
return -EINVAL;
2178
2179
req = __ublk_check_and_get_req(ub, ubq, io, 0);
2180
if (!req)
2181
return -EINVAL;
2182
2183
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2184
issue_flags);
2185
if (ret) {
2186
ublk_put_req_ref(io, req);
2187
return ret;
2188
}
2189
2190
return 0;
2191
}
2192
2193
static int
2194
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
2195
const struct ublk_queue *ubq, struct ublk_io *io,
2196
unsigned index, unsigned issue_flags)
2197
{
2198
unsigned new_registered_buffers;
2199
struct request *req = io->req;
2200
int ret;
2201
2202
/*
2203
* Ensure there are still references for ublk_sub_req_ref() to release.
2204
* If not, fall back on the thread-safe buffer registration.
2205
*/
2206
new_registered_buffers = io->task_registered_buffers + 1;
2207
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
2208
return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
2209
2210
if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
2211
return -EINVAL;
2212
2213
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2214
issue_flags);
2215
if (ret)
2216
return ret;
2217
2218
io->task_registered_buffers = new_registered_buffers;
2219
return 0;
2220
}
2221
2222
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
2223
const struct ublk_device *ub,
2224
unsigned int index, unsigned int issue_flags)
2225
{
2226
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
2227
return -EINVAL;
2228
2229
return io_buffer_unregister_bvec(cmd, index, issue_flags);
2230
}
2231
2232
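/*
 * Validate the buffer address carried by UBLK_IO_FETCH_REQ: when the driver
 * copies data itself (ublk_need_map_io()), an address is mandatory unless
 * NEED_GET_DATA is enabled; with user copy / zero copy the address must be 0.
 */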
static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
2233
{
2234
if (ublk_need_map_io(ubq)) {
2235
/*
2236
* FETCH_RQ has to provide IO buffer if NEED GET
2237
* DATA is not enabled
2238
*/
2239
if (!buf_addr && !ublk_need_get_data(ubq))
2240
return -EINVAL;
2241
} else if (buf_addr) {
2242
/* User copy requires addr to be unset */
2243
return -EINVAL;
2244
}
2245
return 0;
2246
}
2247
2248
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
2249
struct ublk_io *io, __u64 buf_addr)
2250
{
2251
struct ublk_device *ub = ubq->dev;
2252
int ret = 0;
2253
2254
/*
2255
* When handling FETCH command for setting up ublk uring queue,
2256
* ub->mutex is the innermost lock, and we won't block for handling
2257
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
2258
*/
2259
mutex_lock(&ub->mutex);
2260
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
2261
if (ublk_queue_ready(ubq)) {
2262
ret = -EBUSY;
2263
goto out;
2264
}
2265
2266
/* allow each command to be FETCHed at most once */
2267
if (io->flags & UBLK_IO_FLAG_ACTIVE) {
2268
ret = -EINVAL;
2269
goto out;
2270
}
2271
2272
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
2273
2274
ublk_fill_io_cmd(io, cmd);
2275
ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
2276
if (ret)
2277
goto out;
2278
2279
WRITE_ONCE(io->task, get_task_struct(current));
2280
ublk_mark_io_ready(ub, ubq);
2281
out:
2282
mutex_unlock(&ub->mutex);
2283
return ret;
2284
}
2285
2286
static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
2287
struct ublk_io *io, __u64 buf_addr)
2288
{
2289
struct request *req = io->req;
2290
2291
if (ublk_need_map_io(ubq)) {
2292
/*
2293
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
2294
* NEED GET DATA is not enabled or it is Read IO.
2295
*/
2296
if (!buf_addr && (!ublk_need_get_data(ubq) ||
2297
req_op(req) == REQ_OP_READ))
2298
return -EINVAL;
2299
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
2300
/*
2301
* User copy requires addr to be unset when command is
2302
* not zone append
2303
*/
2304
return -EINVAL;
2305
}
2306
2307
return 0;
2308
}
2309
2310
static bool ublk_need_complete_req(const struct ublk_queue *ubq,
2311
struct ublk_io *io)
2312
{
2313
if (ublk_need_req_ref(ubq))
2314
return ublk_sub_req_ref(io);
2315
return true;
2316
}
2317
2318
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2319
struct request *req)
2320
{
2321
/*
2322
* We have handled UBLK_IO_NEED_GET_DATA command,
2323
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
2324
* do the copy work.
2325
*/
2326
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
2327
/* update iod->addr because ublksrv may have passed a new io buffer */
2328
ublk_get_iod(ubq, req->tag)->addr = io->addr;
2329
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
2330
__func__, ubq->q_id, req->tag, io->flags,
2331
ublk_get_iod(ubq, req->tag)->addr);
2332
2333
return ublk_start_io(ubq, req, io);
2334
}
2335
2336
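/*
 * Central handler for all per-io uring_cmds issued on /dev/ublkcN: validates
 * the opcode, queue id and tag, routes buffer (un)registration commands which
 * may run on any task, and requires every other command to be issued from the
 * io's daemon task before handling FETCH, COMMIT_AND_FETCH and NEED_GET_DATA.
 */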
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
2337
unsigned int issue_flags,
2338
const struct ublksrv_io_cmd *ub_cmd)
2339
{
2340
u16 buf_idx = UBLK_INVALID_BUF_IDX;
2341
struct ublk_device *ub = cmd->file->private_data;
2342
struct ublk_queue *ubq;
2343
struct ublk_io *io;
2344
u32 cmd_op = cmd->cmd_op;
2345
unsigned tag = ub_cmd->tag;
2346
struct request *req;
2347
int ret;
2348
bool compl;
2349
2350
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
2351
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
2352
ub_cmd->result);
2353
2354
ret = ublk_check_cmd_op(cmd_op);
2355
if (ret)
2356
goto out;
2357
2358
/*
2359
* io_buffer_unregister_bvec() doesn't access the ubq or io,
2360
* so no need to validate the q_id, tag, or task
2361
*/
2362
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
2363
return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
2364
issue_flags);
2365
2366
ret = -EINVAL;
2367
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
2368
goto out;
2369
2370
ubq = ublk_get_queue(ub, ub_cmd->q_id);
2371
2372
if (tag >= ubq->q_depth)
2373
goto out;
2374
2375
io = &ubq->ios[tag];
2376
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
2377
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
2378
ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
2379
if (ret)
2380
goto out;
2381
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
2382
if (ret)
2383
goto out;
2384
2385
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2386
return -EIOCBQUEUED;
2387
}
2388
2389
if (READ_ONCE(io->task) != current) {
2390
/*
2391
* ublk_register_io_buf() accesses only the io's refcount,
2392
* so can be handled on any task
2393
*/
2394
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
2395
return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
2396
issue_flags);
2397
2398
goto out;
2399
}
2400
2401
/* there is a pending io cmd, something must be wrong */
2402
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
2403
ret = -EBUSY;
2404
goto out;
2405
}
2406
2407
/*
2408
* ensure that the user issues UBLK_IO_NEED_GET_DATA
2409
* iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
2410
*/
2411
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
2412
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
2413
goto out;
2414
2415
switch (_IOC_NR(cmd_op)) {
2416
case UBLK_IO_REGISTER_IO_BUF:
2417
return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
2418
issue_flags);
2419
case UBLK_IO_COMMIT_AND_FETCH_REQ:
2420
ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
2421
if (ret)
2422
goto out;
2423
io->res = ub_cmd->result;
2424
req = ublk_fill_io_cmd(io, cmd);
2425
ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
2426
compl = ublk_need_complete_req(ubq, io);
2427
2428
/* can't touch 'ublk_io' any more */
2429
if (buf_idx != UBLK_INVALID_BUF_IDX)
2430
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
2431
if (req_op(req) == REQ_OP_ZONE_APPEND)
2432
req->__sector = ub_cmd->zone_append_lba;
2433
if (compl)
2434
__ublk_complete_rq(req);
2435
2436
if (ret)
2437
goto out;
2438
break;
2439
case UBLK_IO_NEED_GET_DATA:
2440
/*
2441
* ublk_get_data() may fail and fall back to requeue, so keep the
2442
* uring_cmd active first and prepare for handling the newly requeued
2443
* request
2444
*/
2445
req = ublk_fill_io_cmd(io, cmd);
2446
ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
2447
WARN_ON_ONCE(ret);
2448
if (likely(ublk_get_data(ubq, io, req))) {
2449
__ublk_prep_compl_io_cmd(io, req);
2450
return UBLK_IO_RES_OK;
2451
}
2452
break;
2453
default:
2454
goto out;
2455
}
2456
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2457
return -EIOCBQUEUED;
2458
2459
out:
2460
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
2461
__func__, cmd_op, tag, ret, io->flags);
2462
return ret;
2463
}
2464
2465
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
2466
const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
2467
{
2468
unsigned tag = io - ubq->ios;
2469
struct request *req;
2470
2471
/*
2472
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
2473
* which would overwrite it with io->cmd
2474
*/
2475
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2476
if (!req)
2477
return NULL;
2478
2479
if (!ublk_get_req_ref(io))
2480
return NULL;
2481
2482
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
2483
goto fail_put;
2484
2485
if (!ublk_rq_has_data(req))
2486
goto fail_put;
2487
2488
if (offset > blk_rq_bytes(req))
2489
goto fail_put;
2490
2491
return req;
2492
fail_put:
2493
ublk_put_req_ref(io, req);
2494
return NULL;
2495
}
2496
2497
static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
2498
unsigned int issue_flags)
2499
{
2500
/*
2501
* Not necessary for async retry, but let's keep it simple and always
2502
* copy the values to avoid any potential reuse.
2503
*/
2504
const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
2505
const struct ublksrv_io_cmd ub_cmd = {
2506
.q_id = READ_ONCE(ub_src->q_id),
2507
.tag = READ_ONCE(ub_src->tag),
2508
.result = READ_ONCE(ub_src->result),
2509
.addr = READ_ONCE(ub_src->addr)
2510
};
2511
2512
WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
2513
2514
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
2515
}
2516
2517
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
2518
unsigned int issue_flags)
2519
{
2520
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
2521
2522
if (ret != -EIOCBQUEUED)
2523
io_uring_cmd_done(cmd, ret, 0, issue_flags);
2524
}
2525
2526
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
2527
{
2528
if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
2529
ublk_uring_cmd_cancel_fn(cmd, issue_flags);
2530
return 0;
2531
}
2532
2533
/* a well-implemented server won't run into the unlocked path */
2534
if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
2535
io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
2536
return -EIOCBQUEUED;
2537
}
2538
2539
return ublk_ch_uring_cmd_local(cmd, issue_flags);
2540
}
2541
2542
static inline bool ublk_check_ubuf_dir(const struct request *req,
2543
int ubuf_dir)
2544
{
2545
/* copy ubuf to request pages */
2546
if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
2547
ubuf_dir == ITER_SOURCE)
2548
return true;
2549
2550
/* copy request pages to ubuf */
2551
if ((req_op(req) == REQ_OP_WRITE ||
2552
req_op(req) == REQ_OP_ZONE_APPEND) &&
2553
ubuf_dir == ITER_DEST)
2554
return true;
2555
2556
return false;
2557
}
2558
2559
static struct request *ublk_check_and_get_req(struct kiocb *iocb,
2560
struct iov_iter *iter, size_t *off, int dir,
2561
struct ublk_io **io)
2562
{
2563
struct ublk_device *ub = iocb->ki_filp->private_data;
2564
struct ublk_queue *ubq;
2565
struct request *req;
2566
size_t buf_off;
2567
u16 tag, q_id;
2568
2569
if (!ub)
2570
return ERR_PTR(-EACCES);
2571
2572
if (!user_backed_iter(iter))
2573
return ERR_PTR(-EACCES);
2574
2575
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2576
return ERR_PTR(-EACCES);
2577
2578
tag = ublk_pos_to_tag(iocb->ki_pos);
2579
q_id = ublk_pos_to_hwq(iocb->ki_pos);
2580
buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
2581
2582
if (q_id >= ub->dev_info.nr_hw_queues)
2583
return ERR_PTR(-EINVAL);
2584
2585
ubq = ublk_get_queue(ub, q_id);
2586
if (!ubq)
2587
return ERR_PTR(-EINVAL);
2588
2589
if (!ublk_support_user_copy(ubq))
2590
return ERR_PTR(-EACCES);
2591
2592
if (tag >= ubq->q_depth)
2593
return ERR_PTR(-EINVAL);
2594
2595
*io = &ubq->ios[tag];
2596
req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
2597
if (!req)
2598
return ERR_PTR(-EINVAL);
2599
2600
if (!req->mq_hctx || !req->mq_hctx->driver_data)
2601
goto fail;
2602
2603
if (!ublk_check_ubuf_dir(req, dir))
2604
goto fail;
2605
2606
*off = buf_off;
2607
return req;
2608
fail:
2609
ublk_put_req_ref(*io, req);
2610
return ERR_PTR(-EACCES);
2611
}
2612
2613
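/*
 * read_iter/write_iter on the char device implement UBLK_F_USER_COPY: the
 * file position encodes queue id, tag and buffer offset (see
 * ublk_pos_to_hwq()/ublk_pos_to_tag()/ublk_pos_to_buf_off()), and data is
 * copied between the user buffer and the request's pages while holding a
 * request reference.
 */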
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
2614
{
2615
struct request *req;
2616
struct ublk_io *io;
2617
size_t buf_off;
2618
size_t ret;
2619
2620
req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
2621
if (IS_ERR(req))
2622
return PTR_ERR(req);
2623
2624
ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
2625
ublk_put_req_ref(io, req);
2626
2627
return ret;
2628
}
2629
2630
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
2631
{
2632
struct request *req;
2633
struct ublk_io *io;
2634
size_t buf_off;
2635
size_t ret;
2636
2637
req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
2638
if (IS_ERR(req))
2639
return PTR_ERR(req);
2640
2641
ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
2642
ublk_put_req_ref(io, req);
2643
2644
return ret;
2645
}
2646
2647
static const struct file_operations ublk_ch_fops = {
2648
.owner = THIS_MODULE,
2649
.open = ublk_ch_open,
2650
.release = ublk_ch_release,
2651
.read_iter = ublk_ch_read_iter,
2652
.write_iter = ublk_ch_write_iter,
2653
.uring_cmd = ublk_ch_uring_cmd,
2654
.mmap = ublk_ch_mmap,
2655
};
2656
2657
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
2658
{
2659
int size = ublk_queue_cmd_buf_size(ub, q_id);
2660
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2661
int i;
2662
2663
for (i = 0; i < ubq->q_depth; i++) {
2664
struct ublk_io *io = &ubq->ios[i];
2665
if (io->task)
2666
put_task_struct(io->task);
2667
WARN_ON_ONCE(refcount_read(&io->ref));
2668
WARN_ON_ONCE(io->task_registered_buffers);
2669
}
2670
2671
if (ubq->io_cmd_buf)
2672
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
2673
}
2674
2675
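/*
 * Allocate the per-queue command buffer: an array of ublksrv_io_desc entries
 * (one per tag, see ublk_get_iod()) describing each request to the ublk
 * server.
 */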
static int ublk_init_queue(struct ublk_device *ub, int q_id)
2676
{
2677
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2678
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
2679
void *ptr;
2680
int size;
2681
2682
spin_lock_init(&ubq->cancel_lock);
2683
ubq->flags = ub->dev_info.flags;
2684
ubq->q_id = q_id;
2685
ubq->q_depth = ub->dev_info.queue_depth;
2686
size = ublk_queue_cmd_buf_size(ub, q_id);
2687
2688
ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
2689
if (!ptr)
2690
return -ENOMEM;
2691
2692
ubq->io_cmd_buf = ptr;
2693
ubq->dev = ub;
2694
return 0;
2695
}
2696
2697
static void ublk_deinit_queues(struct ublk_device *ub)
2698
{
2699
int nr_queues = ub->dev_info.nr_hw_queues;
2700
int i;
2701
2702
if (!ub->__queues)
2703
return;
2704
2705
for (i = 0; i < nr_queues; i++)
2706
ublk_deinit_queue(ub, i);
2707
kvfree(ub->__queues);
2708
}
2709
2710
static int ublk_init_queues(struct ublk_device *ub)
2711
{
2712
int nr_queues = ub->dev_info.nr_hw_queues;
2713
int depth = ub->dev_info.queue_depth;
2714
int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
2715
int i, ret = -ENOMEM;
2716
2717
ub->queue_size = ubq_size;
2718
ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
2719
if (!ub->__queues)
2720
return ret;
2721
2722
for (i = 0; i < nr_queues; i++) {
2723
if (ublk_init_queue(ub, i))
2724
goto fail;
2725
}
2726
2727
init_completion(&ub->completion);
2728
return 0;
2729
2730
fail:
2731
ublk_deinit_queues(ub);
2732
return ret;
2733
}
2734
2735
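/*
 * Reserve a slot in ublk_index_idr: a non-negative idx asks for that exact
 * device number (-EEXIST if it is already taken), a negative idx lets the
 * IDR pick the lowest free number below UBLK_MAX_UBLKS.
 */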
static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2736
{
2737
int i = idx;
2738
int err;
2739
2740
spin_lock(&ublk_idr_lock);
2741
/* allocate an id; if @idx >= 0, we're requesting that specific id */
2742
if (i >= 0) {
2743
err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2744
if (err == -ENOSPC)
2745
err = -EEXIST;
2746
} else {
2747
err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2748
GFP_NOWAIT);
2749
}
2750
spin_unlock(&ublk_idr_lock);
2751
2752
if (err >= 0)
2753
ub->ub_number = err;
2754
2755
return err;
2756
}
2757
2758
static void ublk_free_dev_number(struct ublk_device *ub)
2759
{
2760
spin_lock(&ublk_idr_lock);
2761
idr_remove(&ublk_index_idr, ub->ub_number);
2762
wake_up_all(&ublk_idr_wq);
2763
spin_unlock(&ublk_idr_lock);
2764
}
2765
2766
static void ublk_cdev_rel(struct device *dev)
2767
{
2768
struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2769
2770
blk_mq_free_tag_set(&ub->tag_set);
2771
ublk_deinit_queues(ub);
2772
ublk_free_dev_number(ub);
2773
mutex_destroy(&ub->mutex);
2774
mutex_destroy(&ub->cancel_mutex);
2775
kfree(ub);
2776
}
2777
2778
static int ublk_add_chdev(struct ublk_device *ub)
2779
{
2780
struct device *dev = &ub->cdev_dev;
2781
int minor = ub->ub_number;
2782
int ret;
2783
2784
dev->parent = ublk_misc.this_device;
2785
dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2786
dev->class = &ublk_chr_class;
2787
dev->release = ublk_cdev_rel;
2788
device_initialize(dev);
2789
2790
ret = dev_set_name(dev, "ublkc%d", minor);
2791
if (ret)
2792
goto fail;
2793
2794
cdev_init(&ub->cdev, &ublk_ch_fops);
2795
ret = cdev_device_add(&ub->cdev, dev);
2796
if (ret)
2797
goto fail;
2798
2799
if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
2800
unprivileged_ublks_added++;
2801
return 0;
2802
fail:
2803
put_device(dev);
2804
return ret;
2805
}
2806
2807
/* align max io buffer size with PAGE_SIZE */
2808
static void ublk_align_max_io_size(struct ublk_device *ub)
2809
{
2810
unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2811
2812
ub->dev_info.max_io_buf_bytes =
2813
round_down(max_io_bytes, PAGE_SIZE);
2814
}
2815
2816
static int ublk_add_tag_set(struct ublk_device *ub)
2817
{
2818
ub->tag_set.ops = &ublk_mq_ops;
2819
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2820
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2821
ub->tag_set.numa_node = NUMA_NO_NODE;
2822
ub->tag_set.driver_data = ub;
2823
return blk_mq_alloc_tag_set(&ub->tag_set);
2824
}
2825
2826
static void ublk_remove(struct ublk_device *ub)
2827
{
2828
bool unprivileged;
2829
2830
ublk_stop_dev(ub);
2831
cdev_device_del(&ub->cdev, &ub->cdev_dev);
2832
unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2833
ublk_put_device(ub);
2834
2835
if (unprivileged)
2836
unprivileged_ublks_added--;
2837
}
2838
2839
static struct ublk_device *ublk_get_device_from_id(int idx)
2840
{
2841
struct ublk_device *ub = NULL;
2842
2843
if (idx < 0)
2844
return NULL;
2845
2846
spin_lock(&ublk_idr_lock);
2847
ub = idr_find(&ublk_index_idr, idx);
2848
if (ub)
2849
ub = ublk_get_device(ub);
2850
spin_unlock(&ublk_idr_lock);
2851
2852
return ub;
2853
}
2854
2855
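/*
 * START_DEV: build the queue_limits from the validated parameters, wait for
 * every queue to become ready (ub->completion), then allocate and add the
 * ublkb%d gendisk.  Fails with -EEXIST if the device is already live.
 */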
static int ublk_ctrl_start_dev(struct ublk_device *ub,
2856
const struct ublksrv_ctrl_cmd *header)
2857
{
2858
const struct ublk_param_basic *p = &ub->params.basic;
2859
int ublksrv_pid = (int)header->data[0];
2860
struct queue_limits lim = {
2861
.logical_block_size = 1 << p->logical_bs_shift,
2862
.physical_block_size = 1 << p->physical_bs_shift,
2863
.io_min = 1 << p->io_min_shift,
2864
.io_opt = 1 << p->io_opt_shift,
2865
.max_hw_sectors = p->max_sectors,
2866
.chunk_sectors = p->chunk_sectors,
2867
.virt_boundary_mask = p->virt_boundary_mask,
2868
.max_segments = USHRT_MAX,
2869
.max_segment_size = UINT_MAX,
2870
.dma_alignment = 3,
2871
};
2872
struct gendisk *disk;
2873
int ret = -EINVAL;
2874
2875
if (ublksrv_pid <= 0)
2876
return -EINVAL;
2877
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2878
return -EINVAL;
2879
2880
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2881
const struct ublk_param_discard *pd = &ub->params.discard;
2882
2883
lim.discard_alignment = pd->discard_alignment;
2884
lim.discard_granularity = pd->discard_granularity;
2885
lim.max_hw_discard_sectors = pd->max_discard_sectors;
2886
lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2887
lim.max_discard_segments = pd->max_discard_segments;
2888
}
2889
2890
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2891
const struct ublk_param_zoned *p = &ub->params.zoned;
2892
2893
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2894
return -EOPNOTSUPP;
2895
2896
lim.features |= BLK_FEAT_ZONED;
2897
lim.max_active_zones = p->max_active_zones;
2898
lim.max_open_zones = p->max_open_zones;
2899
lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
2900
}
2901
2902
if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
2903
lim.features |= BLK_FEAT_WRITE_CACHE;
2904
if (ub->params.basic.attrs & UBLK_ATTR_FUA)
2905
lim.features |= BLK_FEAT_FUA;
2906
}
2907
2908
if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
2909
lim.features |= BLK_FEAT_ROTATIONAL;
2910
2911
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
2912
lim.dma_alignment = ub->params.dma.alignment;
2913
2914
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
2915
lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
2916
lim.max_segment_size = ub->params.seg.max_segment_size;
2917
lim.max_segments = ub->params.seg.max_segments;
2918
}
2919
2920
if (wait_for_completion_interruptible(&ub->completion) != 0)
2921
return -EINTR;
2922
2923
if (ub->ublksrv_tgid != ublksrv_pid)
2924
return -EINVAL;
2925
2926
mutex_lock(&ub->mutex);
2927
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2928
test_bit(UB_STATE_USED, &ub->state)) {
2929
ret = -EEXIST;
2930
goto out_unlock;
2931
}
2932
2933
disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2934
if (IS_ERR(disk)) {
2935
ret = PTR_ERR(disk);
2936
goto out_unlock;
2937
}
2938
sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2939
disk->fops = &ub_fops;
2940
disk->private_data = ub;
2941
2942
ub->dev_info.ublksrv_pid = ublksrv_pid;
2943
ub->ub_disk = disk;
2944
2945
ublk_apply_params(ub);
2946
2947
/* don't probe partitions if any daemon task is untrusted */
2948
if (ub->unprivileged_daemons)
2949
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2950
2951
ublk_get_device(ub);
2952
ub->dev_info.state = UBLK_S_DEV_LIVE;
2953
2954
if (ublk_dev_is_zoned(ub)) {
2955
ret = ublk_revalidate_disk_zones(ub);
2956
if (ret)
2957
goto out_put_cdev;
2958
}
2959
2960
ret = add_disk(disk);
2961
if (ret)
2962
goto out_put_cdev;
2963
2964
set_bit(UB_STATE_USED, &ub->state);
2965
2966
out_put_cdev:
2967
if (ret) {
2968
ublk_detach_disk(ub);
2969
ublk_put_device(ub);
2970
}
2971
if (ret)
2972
put_disk(disk);
2973
out_unlock:
2974
mutex_unlock(&ub->mutex);
2975
return ret;
2976
}
2977
2978
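/*
 * GET_QUEUE_AFFINITY: report which CPUs are mapped to the given hw queue as
 * a cpumask bitmap; the caller supplies the bitmap buffer and its length in
 * header->addr / header->len.
 */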
static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2979
const struct ublksrv_ctrl_cmd *header)
2980
{
2981
void __user *argp = (void __user *)(unsigned long)header->addr;
2982
cpumask_var_t cpumask;
2983
unsigned long queue;
2984
unsigned int retlen;
2985
unsigned int i;
2986
int ret;
2987
2988
if (header->len * BITS_PER_BYTE < nr_cpu_ids)
2989
return -EINVAL;
2990
if (header->len & (sizeof(unsigned long)-1))
2991
return -EINVAL;
2992
if (!header->addr)
2993
return -EINVAL;
2994
2995
queue = header->data[0];
2996
if (queue >= ub->dev_info.nr_hw_queues)
2997
return -EINVAL;
2998
2999
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
3000
return -ENOMEM;
3001
3002
for_each_possible_cpu(i) {
3003
if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
3004
cpumask_set_cpu(i, cpumask);
3005
}
3006
3007
ret = -EFAULT;
3008
retlen = min_t(unsigned short, header->len, cpumask_size());
3009
if (copy_to_user(argp, cpumask, retlen))
3010
goto out_free_cpumask;
3011
if (retlen != header->len &&
3012
clear_user(argp + retlen, header->len - retlen))
3013
goto out_free_cpumask;
3014
3015
ret = 0;
3016
out_free_cpumask:
3017
free_cpumask_var(cpumask);
3018
return ret;
3019
}
3020
3021
static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
3022
{
3023
pr_devel("%s: dev id %d flags %llx\n", __func__,
3024
info->dev_id, info->flags);
3025
pr_devel("\t nr_hw_queues %d queue_depth %d\n",
3026
info->nr_hw_queues, info->queue_depth);
3027
}
3028
3029
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
3030
{
3031
void __user *argp = (void __user *)(unsigned long)header->addr;
3032
struct ublksrv_ctrl_dev_info info;
3033
struct ublk_device *ub;
3034
int ret = -EINVAL;
3035
3036
if (header->len < sizeof(info) || !header->addr)
3037
return -EINVAL;
3038
if (header->queue_id != (u16)-1) {
3039
pr_warn("%s: queue_id is wrong %x\n",
3040
__func__, header->queue_id);
3041
return -EINVAL;
3042
}
3043
3044
if (copy_from_user(&info, argp, sizeof(info)))
3045
return -EFAULT;
3046
3047
if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
3048
info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
3049
return -EINVAL;
3050
3051
if (capable(CAP_SYS_ADMIN))
3052
info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
3053
else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
3054
return -EPERM;
3055
3056
/* forbid nonsense combinations of recovery flags */
3057
switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
3058
case 0:
3059
case UBLK_F_USER_RECOVERY:
3060
case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
3061
case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
3062
break;
3063
default:
3064
pr_warn("%s: invalid recovery flags %llx\n", __func__,
3065
info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
3066
return -EINVAL;
3067
}
3068
3069
if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
3070
pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
3071
return -EINVAL;
3072
}
3073
3074
/*
3075
* an unprivileged device can't be trusted, but RECOVERY and
3076
* RECOVERY_REISSUE may still hang error handling, so recovery
3077
* features can't be supported for unprivileged ublk now
3078
*
3079
* TODO: provide forward progress for RECOVERY handler, so that
3080
* unprivileged device can benefit from it
3081
*/
3082
if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
3083
info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
3084
UBLK_F_USER_RECOVERY);
3085
3086
/*
3087
* For USER_COPY, we depend on userspace to fill the request
3088
* buffer by pwrite() to the ublk char device, which can't be
3089
* used for an unprivileged device
3090
*
3091
* The same applies to zero copy and auto buffer registration.
3092
*/
3093
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3094
UBLK_F_AUTO_BUF_REG))
3095
return -EINVAL;
3096
}
3097
3098
/* the created device is always owned by current user */
3099
ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
3100
3101
if (header->dev_id != info.dev_id) {
3102
pr_warn("%s: dev id not match %u %u\n",
3103
__func__, header->dev_id, info.dev_id);
3104
return -EINVAL;
3105
}
3106
3107
if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
3108
pr_warn("%s: dev id is too large. Max supported is %d\n",
3109
__func__, UBLK_MAX_UBLKS - 1);
3110
return -EINVAL;
3111
}
3112
3113
ublk_dump_dev_info(&info);
3114
3115
ret = mutex_lock_killable(&ublk_ctl_mutex);
3116
if (ret)
3117
return ret;
3118
3119
ret = -EACCES;
3120
if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
3121
unprivileged_ublks_added >= unprivileged_ublks_max)
3122
goto out_unlock;
3123
3124
ret = -ENOMEM;
3125
ub = kzalloc(sizeof(*ub), GFP_KERNEL);
3126
if (!ub)
3127
goto out_unlock;
3128
mutex_init(&ub->mutex);
3129
spin_lock_init(&ub->lock);
3130
mutex_init(&ub->cancel_mutex);
3131
3132
ret = ublk_alloc_dev_number(ub, header->dev_id);
3133
if (ret < 0)
3134
goto out_free_ub;
3135
3136
memcpy(&ub->dev_info, &info, sizeof(info));
3137
3138
/* update device id */
3139
ub->dev_info.dev_id = ub->ub_number;
3140
3141
/*
3142
* The 64-bit flags will be copied back to userspace as the feature
3143
* negotiation result, so clear any flags which the driver
3144
* doesn't support yet; then userspace can get the correct flags
3145
* (features) to handle.
3146
*/
3147
ub->dev_info.flags &= UBLK_F_ALL;
3148
3149
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
3150
UBLK_F_URING_CMD_COMP_IN_TASK |
3151
UBLK_F_PER_IO_DAEMON |
3152
UBLK_F_BUF_REG_OFF_DAEMON;
3153
3154
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
3155
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3156
UBLK_F_AUTO_BUF_REG))
3157
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
3158
3159
/*
3160
* Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
3161
* returning the zone append LBA, which is only allowed in case of
3162
* user copy or zero copy
3163
*/
3164
if (ublk_dev_is_zoned(ub) &&
3165
(!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
3166
(UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
3167
ret = -EINVAL;
3168
goto out_free_dev_number;
3169
}
3170
3171
ub->dev_info.nr_hw_queues = min_t(unsigned int,
3172
ub->dev_info.nr_hw_queues, nr_cpu_ids);
3173
ublk_align_max_io_size(ub);
3174
3175
ret = ublk_init_queues(ub);
3176
if (ret)
3177
goto out_free_dev_number;
3178
3179
ret = ublk_add_tag_set(ub);
3180
if (ret)
3181
goto out_deinit_queues;
3182
3183
ret = -EFAULT;
3184
if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
3185
goto out_free_tag_set;
3186
3187
/*
3188
* Add the char dev so that ublksrv daemon can be setup.
3189
* ublk_add_chdev() will cleanup everything if it fails.
3190
*/
3191
ret = ublk_add_chdev(ub);
3192
goto out_unlock;
3193
3194
out_free_tag_set:
3195
blk_mq_free_tag_set(&ub->tag_set);
3196
out_deinit_queues:
3197
ublk_deinit_queues(ub);
3198
out_free_dev_number:
3199
ublk_free_dev_number(ub);
3200
out_free_ub:
3201
mutex_destroy(&ub->mutex);
3202
mutex_destroy(&ub->cancel_mutex);
3203
kfree(ub);
3204
out_unlock:
3205
mutex_unlock(&ublk_ctl_mutex);
3206
return ret;
3207
}
3208
3209
static inline bool ublk_idr_freed(int id)
3210
{
3211
void *ptr;
3212
3213
spin_lock(&ublk_idr_lock);
3214
ptr = idr_find(&ublk_index_idr, id);
3215
spin_unlock(&ublk_idr_lock);
3216
3217
return ptr == NULL;
3218
}
3219
3220
static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
3221
{
3222
struct ublk_device *ub = *p_ub;
3223
int idx = ub->ub_number;
3224
int ret;
3225
3226
ret = mutex_lock_killable(&ublk_ctl_mutex);
3227
if (ret)
3228
return ret;
3229
3230
if (!test_bit(UB_STATE_DELETED, &ub->state)) {
3231
ublk_remove(ub);
3232
set_bit(UB_STATE_DELETED, &ub->state);
3233
}
3234
3235
/* Mark the reference as consumed */
3236
*p_ub = NULL;
3237
ublk_put_device(ub);
3238
mutex_unlock(&ublk_ctl_mutex);
3239
3240
/*
3241
* Wait until the idr is removed, then it can be reused after
3242
* DEL_DEV command is returned.
3243
*
3244
* If we return because of a user interrupt, a future delete command
3245
* may come:
3246
*
3247
* - the device number isn't freed, this device won't or needn't
3248
* be deleted again, since UB_STATE_DELETED is set, and device
3249
* will be released after the last reference is dropped
3250
*
3251
* - the device number is freed already, we will not find this
3252
* device via ublk_get_device_from_id()
3253
*/
3254
if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
3255
return -EINTR;
3256
return 0;
3257
}
3258
3259
static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
3260
{
3261
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3262
3263
pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
3264
__func__, cmd->cmd_op, header->dev_id, header->queue_id,
3265
header->data[0], header->addr, header->len);
3266
}
3267
3268
static int ublk_ctrl_stop_dev(struct ublk_device *ub)
3269
{
3270
ublk_stop_dev(ub);
3271
return 0;
3272
}
3273
3274
static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
3275
const struct ublksrv_ctrl_cmd *header)
3276
{
3277
void __user *argp = (void __user *)(unsigned long)header->addr;
3278
3279
if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
3280
return -EINVAL;
3281
3282
if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
3283
return -EFAULT;
3284
3285
return 0;
3286
}
3287
3288
/* TYPE_DEVT is readonly, so fill it up before returning to userspace */
3289
static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
3290
{
3291
ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
3292
ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
3293
3294
if (ub->ub_disk) {
3295
ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
3296
ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
3297
} else {
3298
ub->params.devt.disk_major = 0;
3299
ub->params.devt.disk_minor = 0;
3300
}
3301
ub->params.types |= UBLK_PARAM_TYPE_DEVT;
3302
}
3303
3304
static int ublk_ctrl_get_params(struct ublk_device *ub,
3305
const struct ublksrv_ctrl_cmd *header)
3306
{
3307
void __user *argp = (void __user *)(unsigned long)header->addr;
3308
struct ublk_params_header ph;
3309
int ret;
3310
3311
if (header->len <= sizeof(ph) || !header->addr)
3312
return -EINVAL;
3313
3314
if (copy_from_user(&ph, argp, sizeof(ph)))
3315
return -EFAULT;
3316
3317
if (ph.len > header->len || !ph.len)
3318
return -EINVAL;
3319
3320
if (ph.len > sizeof(struct ublk_params))
3321
ph.len = sizeof(struct ublk_params);
3322
3323
mutex_lock(&ub->mutex);
3324
ublk_ctrl_fill_params_devt(ub);
3325
if (copy_to_user(argp, &ub->params, ph.len))
3326
ret = -EFAULT;
3327
else
3328
ret = 0;
3329
mutex_unlock(&ub->mutex);
3330
3331
return ret;
3332
}
3333
3334
static int ublk_ctrl_set_params(struct ublk_device *ub,
3335
const struct ublksrv_ctrl_cmd *header)
3336
{
3337
void __user *argp = (void __user *)(unsigned long)header->addr;
3338
struct ublk_params_header ph;
3339
int ret = -EFAULT;
3340
3341
if (header->len <= sizeof(ph) || !header->addr)
3342
return -EINVAL;
3343
3344
if (copy_from_user(&ph, argp, sizeof(ph)))
3345
return -EFAULT;
3346
3347
if (ph.len > header->len || !ph.len || !ph.types)
3348
return -EINVAL;
3349
3350
if (ph.len > sizeof(struct ublk_params))
3351
ph.len = sizeof(struct ublk_params);
3352
3353
mutex_lock(&ub->mutex);
3354
if (test_bit(UB_STATE_USED, &ub->state)) {
3355
/*
3356
* Parameters can only be changed when the device hasn't
3357
* been started yet
3358
*/
3359
ret = -EACCES;
3360
} else if (copy_from_user(&ub->params, argp, ph.len)) {
3361
ret = -EFAULT;
3362
} else {
3363
/* clear all we don't support yet */
3364
ub->params.types &= UBLK_PARAM_TYPE_ALL;
3365
ret = ublk_validate_params(ub);
3366
if (ret)
3367
ub->params.types = 0;
3368
}
3369
mutex_unlock(&ub->mutex);
3370
3371
return ret;
3372
}
3373
3374
static int ublk_ctrl_start_recovery(struct ublk_device *ub,
3375
const struct ublksrv_ctrl_cmd *header)
3376
{
3377
int ret = -EINVAL;
3378
3379
mutex_lock(&ub->mutex);
3380
if (ublk_nosrv_should_stop_dev(ub))
3381
goto out_unlock;
3382
/*
3383
* START_RECOVERY is only allowed after:
3384
*
3385
* (1) UB_STATE_OPEN is not set, which means the dying process has exited
3386
* and the related io_uring ctx is freed, so the file struct of /dev/ublkcX is
3387
* released.
3388
*
3389
* and one of the following holds
3390
*
3391
* (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
3392
* (a) has quiesced the request queue
3393
* (b) has requeued every inflight rq whose io_flags is ACTIVE
3394
* (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
3395
* (d) has completed/canceled all ioucmds owned by the dying process
3396
*
3397
* (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
3398
* quiesced, but all I/O is being immediately errored
3399
*/
3400
if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
3401
ret = -EBUSY;
3402
goto out_unlock;
3403
}
3404
pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
3405
init_completion(&ub->completion);
3406
ret = 0;
3407
out_unlock:
3408
mutex_unlock(&ub->mutex);
3409
return ret;
3410
}
3411
3412
static int ublk_ctrl_end_recovery(struct ublk_device *ub,
3413
const struct ublksrv_ctrl_cmd *header)
3414
{
3415
int ublksrv_pid = (int)header->data[0];
3416
int ret = -EINVAL;
3417
3418
pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
3419
header->dev_id);
3420
3421
if (wait_for_completion_interruptible(&ub->completion))
3422
return -EINTR;
3423
3424
pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
3425
header->dev_id);
3426
3427
if (ub->ublksrv_tgid != ublksrv_pid)
3428
return -EINVAL;
3429
3430
mutex_lock(&ub->mutex);
3431
if (ublk_nosrv_should_stop_dev(ub))
3432
goto out_unlock;
3433
3434
if (!ublk_dev_in_recoverable_state(ub)) {
3435
ret = -EBUSY;
3436
goto out_unlock;
3437
}
3438
ub->dev_info.ublksrv_pid = ublksrv_pid;
3439
ub->dev_info.state = UBLK_S_DEV_LIVE;
3440
pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
3441
__func__, ublksrv_pid, header->dev_id);
3442
blk_mq_kick_requeue_list(ub->ub_disk->queue);
3443
ret = 0;
3444
out_unlock:
3445
mutex_unlock(&ub->mutex);
3446
return ret;
3447
}
3448
3449
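/*
 * GET_FEATURES: feature negotiation entry point; reports the full set of
 * flags supported by this kernel (UBLK_F_ALL) so the server knows what it
 * may enable in ADD_DEV.
 */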
static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
3450
{
3451
void __user *argp = (void __user *)(unsigned long)header->addr;
3452
u64 features = UBLK_F_ALL;
3453
3454
if (header->len != UBLK_FEATURES_LEN || !header->addr)
3455
return -EINVAL;
3456
3457
if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
3458
return -EFAULT;
3459
3460
return 0;
3461
}
3462
3463
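/*
 * UPDATE_SIZE: header->data[0] carries the new device size in 512-byte
 * sectors; update params.basic.dev_sectors and propagate the new capacity
 * to the gendisk.
 */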
static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
3464
{
3465
struct ublk_param_basic *p = &ub->params.basic;
3466
u64 new_size = header->data[0];
3467
3468
mutex_lock(&ub->mutex);
3469
p->dev_sectors = new_size;
3470
set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
3471
mutex_unlock(&ub->mutex);
3472
}
3473
3474
struct count_busy {
3475
const struct ublk_queue *ubq;
3476
unsigned int nr_busy;
3477
};
3478
3479
static bool ublk_count_busy_req(struct request *rq, void *data)
3480
{
3481
struct count_busy *idle = data;
3482
3483
if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
3484
idle->nr_busy += 1;
3485
return true;
3486
}
3487
3488
/* uring_cmd is guaranteed to be active if the associated request is idle */
3489
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
3490
{
3491
struct count_busy data = {
3492
.ubq = ubq,
3493
};
3494
3495
blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
3496
return data.nr_busy < ubq->q_depth;
3497
}
3498
3499
/* Wait until each hw queue has at least one idle IO */
3500
static int ublk_wait_for_idle_io(struct ublk_device *ub,
3501
unsigned int timeout_ms)
3502
{
3503
unsigned int elapsed = 0;
3504
int ret;
3505
3506
while (elapsed < timeout_ms && !signal_pending(current)) {
3507
unsigned int queues_cancelable = 0;
3508
int i;
3509
3510
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
3511
struct ublk_queue *ubq = ublk_get_queue(ub, i);
3512
3513
queues_cancelable += !!ubq_has_idle_io(ubq);
3514
}
3515
3516
/*
3517
* Each queue needs at least one active command for
3518
* notifying ublk server
3519
*/
3520
if (queues_cancelable == ub->dev_info.nr_hw_queues)
3521
break;
3522
3523
msleep(UBLK_REQUEUE_DELAY_MS);
3524
elapsed += UBLK_REQUEUE_DELAY_MS;
3525
}
3526
3527
if (signal_pending(current))
3528
ret = -EINTR;
3529
else if (elapsed >= timeout_ms)
3530
ret = -EBUSY;
3531
else
3532
ret = 0;
3533
3534
return ret;
3535
}
3536
3537
static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
3538
const struct ublksrv_ctrl_cmd *header)
3539
{
3540
/* zero means wait forever */
3541
u64 timeout_ms = header->data[0];
3542
struct gendisk *disk;
3543
int ret = -ENODEV;
3544
3545
if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
3546
return -EOPNOTSUPP;
3547
3548
mutex_lock(&ub->mutex);
3549
disk = ublk_get_disk(ub);
3550
if (!disk)
3551
goto unlock;
3552
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3553
goto put_disk;
3554
3555
ret = 0;
3556
/* already in expected state */
3557
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
3558
goto put_disk;
3559
3560
/* Mark the device as canceling */
3561
mutex_lock(&ub->cancel_mutex);
3562
blk_mq_quiesce_queue(disk->queue);
3563
ublk_set_canceling(ub, true);
3564
blk_mq_unquiesce_queue(disk->queue);
3565
mutex_unlock(&ub->cancel_mutex);
3566
3567
if (!timeout_ms)
3568
timeout_ms = UINT_MAX;
3569
ret = ublk_wait_for_idle_io(ub, timeout_ms);
3570
3571
put_disk:
3572
ublk_put_disk(disk);
3573
unlock:
3574
mutex_unlock(&ub->mutex);
3575
3576
/* Cancel pending uring_cmd */
3577
if (!ret)
3578
ublk_cancel_dev(ub);
3579
return ret;
3580
}
3581
3582
/*
3583
* All control commands are sent via /dev/ublk-control, so we have to check
3584
* the destination device's permission
3585
*/
3586
static int ublk_char_dev_permission(struct ublk_device *ub,
3587
const char *dev_path, int mask)
3588
{
3589
int err;
3590
struct path path;
3591
struct kstat stat;
3592
3593
err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
3594
if (err)
3595
return err;
3596
3597
err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
3598
if (err)
3599
goto exit;
3600
3601
err = -EPERM;
3602
if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
3603
goto exit;
3604
3605
err = inode_permission(&nop_mnt_idmap,
3606
d_backing_inode(path.dentry), mask);
3607
exit:
3608
path_put(&path);
3609
return err;
3610
}
3611
3612
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
3613
struct io_uring_cmd *cmd)
3614
{
3615
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
3616
bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
3617
void __user *argp = (void __user *)(unsigned long)header->addr;
3618
char *dev_path = NULL;
3619
int ret = 0;
3620
int mask;
3621
3622
if (!unprivileged) {
3623
if (!capable(CAP_SYS_ADMIN))
3624
return -EPERM;
3625
/*
3626
* The newly added UBLK_CMD_GET_DEV_INFO2 command includes
3627
* char_dev_path in its payload too, since userspace may not
3628
* know if the specified device was created in unprivileged
3629
* mode.
3630
*/
3631
if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
3632
return 0;
3633
}
3634
3635
/*
3636
* The user has to provide the char device path for unprivileged ublk.
3637
*
3638
* header->addr always points to the dev path buffer, and
3639
* header->dev_path_len records the length of the dev path buffer.
3640
*/
3641
if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
3642
return -EINVAL;
3643
3644
if (header->len < header->dev_path_len)
3645
return -EINVAL;
3646
3647
dev_path = memdup_user_nul(argp, header->dev_path_len);
3648
if (IS_ERR(dev_path))
3649
return PTR_ERR(dev_path);
3650
3651
ret = -EINVAL;
3652
switch (_IOC_NR(cmd->cmd_op)) {
3653
case UBLK_CMD_GET_DEV_INFO:
3654
case UBLK_CMD_GET_DEV_INFO2:
3655
case UBLK_CMD_GET_QUEUE_AFFINITY:
3656
case UBLK_CMD_GET_PARAMS:
3657
case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
3658
mask = MAY_READ;
3659
break;
3660
case UBLK_CMD_START_DEV:
3661
case UBLK_CMD_STOP_DEV:
3662
case UBLK_CMD_ADD_DEV:
3663
case UBLK_CMD_DEL_DEV:
3664
case UBLK_CMD_SET_PARAMS:
3665
case UBLK_CMD_START_USER_RECOVERY:
3666
case UBLK_CMD_END_USER_RECOVERY:
3667
case UBLK_CMD_UPDATE_SIZE:
3668
case UBLK_CMD_QUIESCE_DEV:
3669
mask = MAY_READ | MAY_WRITE;
3670
break;
3671
default:
3672
goto exit;
3673
}
3674
3675
ret = ublk_char_dev_permission(ub, dev_path, mask);
3676
if (!ret) {
3677
header->len -= header->dev_path_len;
3678
header->addr += header->dev_path_len;
3679
}
3680
pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
3681
__func__, ub->ub_number, cmd->cmd_op,
3682
ub->dev_info.owner_uid, ub->dev_info.owner_gid,
3683
dev_path, ret);
3684
exit:
3685
kfree(dev_path);
3686
return ret;
3687
}
3688
3689
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
3690
unsigned int issue_flags)
3691
{
3692
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3693
struct ublk_device *ub = NULL;
3694
u32 cmd_op = cmd->cmd_op;
3695
int ret = -EINVAL;
3696
3697
if (issue_flags & IO_URING_F_NONBLOCK)
3698
return -EAGAIN;
3699
3700
ublk_ctrl_cmd_dump(cmd);
3701
3702
if (!(issue_flags & IO_URING_F_SQE128))
3703
goto out;
3704
3705
ret = ublk_check_cmd_op(cmd_op);
3706
if (ret)
3707
goto out;
3708
3709
if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
3710
ret = ublk_ctrl_get_features(header);
3711
goto out;
3712
}
3713
3714
if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
3715
ret = -ENODEV;
3716
ub = ublk_get_device_from_id(header->dev_id);
3717
if (!ub)
3718
goto out;
3719
3720
ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
3721
if (ret)
3722
goto put_dev;
3723
}
3724
3725
switch (_IOC_NR(cmd_op)) {
3726
case UBLK_CMD_START_DEV:
3727
ret = ublk_ctrl_start_dev(ub, header);
3728
break;
3729
case UBLK_CMD_STOP_DEV:
3730
ret = ublk_ctrl_stop_dev(ub);
3731
break;
3732
case UBLK_CMD_GET_DEV_INFO:
3733
case UBLK_CMD_GET_DEV_INFO2:
3734
ret = ublk_ctrl_get_dev_info(ub, header);
3735
break;
3736
case UBLK_CMD_ADD_DEV:
3737
ret = ublk_ctrl_add_dev(header);
3738
break;
3739
case UBLK_CMD_DEL_DEV:
3740
ret = ublk_ctrl_del_dev(&ub, true);
3741
break;
3742
case UBLK_CMD_DEL_DEV_ASYNC:
3743
ret = ublk_ctrl_del_dev(&ub, false);
3744
break;
3745
case UBLK_CMD_GET_QUEUE_AFFINITY:
3746
ret = ublk_ctrl_get_queue_affinity(ub, header);
3747
break;
3748
case UBLK_CMD_GET_PARAMS:
3749
ret = ublk_ctrl_get_params(ub, header);
3750
break;
3751
case UBLK_CMD_SET_PARAMS:
3752
ret = ublk_ctrl_set_params(ub, header);
3753
break;
3754
case UBLK_CMD_START_USER_RECOVERY:
3755
ret = ublk_ctrl_start_recovery(ub, header);
3756
break;
3757
case UBLK_CMD_END_USER_RECOVERY:
3758
ret = ublk_ctrl_end_recovery(ub, header);
3759
break;
3760
case UBLK_CMD_UPDATE_SIZE:
3761
ublk_ctrl_set_size(ub, header);
3762
ret = 0;
3763
break;
3764
case UBLK_CMD_QUIESCE_DEV:
3765
ret = ublk_ctrl_quiesce_dev(ub, header);
3766
break;
3767
default:
3768
ret = -EOPNOTSUPP;
3769
break;
3770
}
3771
3772
put_dev:
3773
if (ub)
3774
ublk_put_device(ub);
3775
out:
3776
pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
3777
__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
3778
return ret;
3779
}
3780
3781
static const struct file_operations ublk_ctl_fops = {
3782
.open = nonseekable_open,
3783
.uring_cmd = ublk_ctrl_uring_cmd,
3784
.owner = THIS_MODULE,
3785
.llseek = noop_llseek,
3786
};
3787
3788
static struct miscdevice ublk_misc = {
3789
.minor = MISC_DYNAMIC_MINOR,
3790
.name = "ublk-control",
3791
.fops = &ublk_ctl_fops,
3792
};
3793
3794
static int __init ublk_init(void)
3795
{
3796
int ret;
3797
3798
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
3799
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
3800
BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
3801
3802
init_waitqueue_head(&ublk_idr_wq);
3803
3804
ret = misc_register(&ublk_misc);
3805
if (ret)
3806
return ret;
3807
3808
ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
3809
if (ret)
3810
goto unregister_mis;
3811
3812
ret = class_register(&ublk_chr_class);
3813
if (ret)
3814
goto free_chrdev_region;
3815
3816
return 0;
3817
3818
free_chrdev_region:
3819
unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3820
unregister_mis:
3821
misc_deregister(&ublk_misc);
3822
return ret;
3823
}
3824
3825
static void __exit ublk_exit(void)
3826
{
3827
struct ublk_device *ub;
3828
int id;
3829
3830
idr_for_each_entry(&ublk_index_idr, ub, id)
3831
ublk_remove(ub);
3832
3833
class_unregister(&ublk_chr_class);
3834
misc_deregister(&ublk_misc);
3835
3836
idr_destroy(&ublk_index_idr);
3837
unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3838
}
3839
3840
module_init(ublk_init);
3841
module_exit(ublk_exit);
3842
3843
static int ublk_set_max_unprivileged_ublks(const char *buf,
3844
const struct kernel_param *kp)
3845
{
3846
return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3847
}
3848
3849
static int ublk_get_max_unprivileged_ublks(char *buf,
3850
const struct kernel_param *kp)
3851
{
3852
return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
3853
}
3854
3855
static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
3856
.set = ublk_set_max_unprivileged_ublks,
3857
.get = ublk_get_max_unprivileged_ublks,
3858
};
3859
3860
module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
3861
&unprivileged_ublks_max, 0644);
3862
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add (default: 64)");
3863
3864
MODULE_AUTHOR("Ming Lei <[email protected]>");
3865
MODULE_DESCRIPTION("Userspace block device");
3866
MODULE_LICENSE("GPL");
3867
3868