GitHub Repository: torvalds/linux
Path: blob/master/drivers/block/ublk_drv.c
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
 * Userspace block device - a block device whose IO is handled in userspace
 *
 * Makes full use of io_uring passthrough commands for communicating with
 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
 *
 * Copyright 2022 Ming Lei <[email protected]>
 *
 * (part of code stolen from loop.c)
 */
12
#include <linux/module.h>
13
#include <linux/moduleparam.h>
14
#include <linux/sched.h>
15
#include <linux/fs.h>
16
#include <linux/pagemap.h>
17
#include <linux/file.h>
18
#include <linux/stat.h>
19
#include <linux/errno.h>
20
#include <linux/major.h>
21
#include <linux/wait.h>
22
#include <linux/blkdev.h>
23
#include <linux/init.h>
24
#include <linux/swap.h>
25
#include <linux/slab.h>
26
#include <linux/compat.h>
27
#include <linux/mutex.h>
28
#include <linux/writeback.h>
29
#include <linux/completion.h>
30
#include <linux/highmem.h>
31
#include <linux/sysfs.h>
32
#include <linux/miscdevice.h>
33
#include <linux/falloc.h>
34
#include <linux/uio.h>
35
#include <linux/ioprio.h>
36
#include <linux/sched/mm.h>
37
#include <linux/uaccess.h>
38
#include <linux/cdev.h>
39
#include <linux/io_uring/cmd.h>
40
#include <linux/blk-mq.h>
41
#include <linux/delay.h>
42
#include <linux/mm.h>
43
#include <asm/page.h>
44
#include <linux/task_work.h>
45
#include <linux/namei.h>
46
#include <linux/kref.h>
47
#include <linux/kfifo.h>
48
#include <linux/blk-integrity.h>
49
#include <uapi/linux/fs.h>
50
#include <uapi/linux/ublk_cmd.h>
51
52
#define UBLK_MINORS (1U << MINORBITS)
53
54
#define UBLK_INVALID_BUF_IDX ((u16)-1)
55
56
/* private ioctl command mirror */
57
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
58
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
59
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
60
#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
61
62
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
63
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
64
65
/* All UBLK_F_* have to be included into UBLK_F_ALL */
66
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
67
| UBLK_F_URING_CMD_COMP_IN_TASK \
68
| UBLK_F_NEED_GET_DATA \
69
| UBLK_F_USER_RECOVERY \
70
| UBLK_F_USER_RECOVERY_REISSUE \
71
| UBLK_F_UNPRIVILEGED_DEV \
72
| UBLK_F_CMD_IOCTL_ENCODE \
73
| UBLK_F_USER_COPY \
74
| UBLK_F_ZONED \
75
| UBLK_F_USER_RECOVERY_FAIL_IO \
76
| UBLK_F_UPDATE_SIZE \
77
| UBLK_F_AUTO_BUF_REG \
78
| UBLK_F_QUIESCE \
79
| UBLK_F_PER_IO_DAEMON \
80
| UBLK_F_BUF_REG_OFF_DAEMON \
81
| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
82
| UBLK_F_SAFE_STOP_DEV \
83
| UBLK_F_BATCH_IO \
84
| UBLK_F_NO_AUTO_PART_SCAN)
85
86
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
87
| UBLK_F_USER_RECOVERY_REISSUE \
88
| UBLK_F_USER_RECOVERY_FAIL_IO)
89
90
/* All UBLK_PARAM_TYPE_* should be included here */
91
#define UBLK_PARAM_TYPE_ALL \
92
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
93
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
94
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
95
UBLK_PARAM_TYPE_INTEGRITY)
96
97
#define UBLK_BATCH_F_ALL \
98
(UBLK_BATCH_F_HAS_ZONE_LBA | \
99
UBLK_BATCH_F_HAS_BUF_ADDR | \
100
UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
101
102
/* ublk batch fetch uring_cmd */
103
struct ublk_batch_fetch_cmd {
104
struct list_head node;
105
struct io_uring_cmd *cmd;
106
unsigned short buf_group;
107
};
108
109
struct ublk_uring_cmd_pdu {
110
/*
 * Temporarily store requests of the same batch for queuing them to
 * the daemon context.
 *
 * They could have been stored in the request payload, but we want to
 * avoid extra pre-allocation, and the uring_cmd payload is always
 * free for us.
 */
118
union {
119
struct request *req;
120
struct request *req_list;
121
};
122
123
/*
124
* The following two are valid in this cmd whole lifetime, and
125
* setup in ublk uring_cmd handler
126
*/
127
struct ublk_queue *ubq;
128
129
union {
130
u16 tag;
131
struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
132
};
133
};
134
135
struct ublk_batch_io_data {
136
struct ublk_device *ub;
137
struct io_uring_cmd *cmd;
138
struct ublk_batch_io header;
139
unsigned int issue_flags;
140
struct io_comp_batch *iob;
141
};
142
143
/*
 * io command is active: the sqe cmd has been received, and its cqe isn't
 * done yet
 *
 * If the flag is set, the io command is owned by the ublk driver and waits
 * for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and owned by
 * the ublk server.
 */
152
#define UBLK_IO_FLAG_ACTIVE 0x01
153
154
/*
 * The IO command has been completed via cqe, is being handled by ublksrv,
 * and has not been committed yet.
 *
 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
 * for cross verification.
 */
161
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
162
163
/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
 * getting the data buffer address from ublksrv.
 *
 * Then bio data can be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
170
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
171
172
/*
 * The request buffer is registered automatically, so we have to unregister
 * it before completing this request.
 *
 * io_uring will unregister the buffer automatically for us during exit.
 */
178
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
179
180
/* atomic RW with ubq->cancel_lock */
181
#define UBLK_IO_FLAG_CANCELED 0x80000000
182
183
/*
184
* Initialize refcount to a large number to include any registered buffers.
185
* UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
186
* any buffers registered on the io daemon task.
187
*/
188
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
189
190
/* used for UBLK_F_BATCH_IO only */
191
#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
192
193
union ublk_io_buf {
194
__u64 addr;
195
struct ublk_auto_buf_reg auto_reg;
196
};
197
198
struct ublk_io {
199
union ublk_io_buf buf;
200
unsigned int flags;
201
int res;
202
203
union {
204
/* valid if UBLK_IO_FLAG_ACTIVE is set */
205
struct io_uring_cmd *cmd;
206
/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
207
struct request *req;
208
};
209
210
struct task_struct *task;
211
212
/*
213
* The number of uses of this I/O by the ublk server
214
* if user copy or zero copy are enabled:
215
* - UBLK_REFCOUNT_INIT from dispatch to the server
216
* until UBLK_IO_COMMIT_AND_FETCH_REQ
217
* - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
218
* - 1 for each io_uring registered buffer not registered on task
219
* The I/O can only be completed once all references are dropped.
220
* User copy and buffer registration operations are only permitted
221
* if the reference count is nonzero.
222
*/
223
refcount_t ref;
224
/* Count of buffers registered on task and not yet unregistered */
225
unsigned task_registered_buffers;
226
227
void *buf_ctx_handle;
228
spinlock_t lock;
229
} ____cacheline_aligned_in_smp;
230
231
struct ublk_queue {
232
int q_id;
233
int q_depth;
234
235
unsigned long flags;
236
struct ublksrv_io_desc *io_cmd_buf;
237
238
bool force_abort;
239
bool canceling;
240
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
241
spinlock_t cancel_lock;
242
struct ublk_device *dev;
243
u32 nr_io_ready;
244
245
/*
 * For supporting UBLK_F_BATCH_IO only.
 *
 * Inflight ublk request tags are saved in this fifo.
 *
 * There are multiple writers, from ublk_queue_rq() or ublk_queue_rqs(),
 * so the lock is required for storing request tags into the fifo.
 *
 * Make sure there is just one reader fetching requests from the task
 * work function to the ublk server, so there is no need to grab the
 * lock on the reader side.
 *
 * Batch I/O State Management:
 *
 * The batch I/O system uses implicit state management based on the
 * combination of the three key variables below (evts_fifo, fcmd_head and
 * active_fcmd); see the illustrative state classifier sketched after this
 * struct.
 *
 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
 *   No fetch commands available, events queue up in evts_fifo
 *
 * - READY: !list_empty(&fcmd_head) && !active_fcmd
 *   Fetch commands available but none processing events
 *
 * - ACTIVE: active_fcmd
 *   One fetch command actively processing events from evts_fifo
 *
 * Key Invariants:
 * - At most one active_fcmd at any time (single reader)
 * - active_fcmd is always taken from the fcmd_head list when non-NULL
 * - evts_fifo can be read locklessly by the single active reader
 * - All state transitions require evts_lock protection
 * - Multiple writers to evts_fifo require lock protection
 */
278
struct {
279
DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
280
spinlock_t evts_lock;
281
282
/* List of fetch commands available to process events */
283
struct list_head fcmd_head;
284
285
/* Currently active fetch command (NULL = none active) */
286
struct ublk_batch_fetch_cmd *active_fcmd;
287
}____cacheline_aligned_in_smp;
288
289
struct ublk_io ios[] __counted_by(q_depth);
290
};
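
/*
 * Illustrative sketch only (not part of the driver): the implicit
 * IDLE/READY/ACTIVE batch dispatch states described in the comment inside
 * struct ublk_queue, written out as a hypothetical helper. The caller would
 * have to hold ubq->evts_lock.
 */
#if 0
enum ublk_batch_state {
	UBLK_BATCH_IDLE,	/* no fetch cmds, events pile up in evts_fifo */
	UBLK_BATCH_READY,	/* fetch cmds queued, none processing events */
	UBLK_BATCH_ACTIVE,	/* one fetch cmd is draining evts_fifo */
};

static enum ublk_batch_state ublk_batch_state(const struct ublk_queue *ubq)
{
	if (ubq->active_fcmd)
		return UBLK_BATCH_ACTIVE;
	if (!list_empty(&ubq->fcmd_head))
		return UBLK_BATCH_READY;
	return UBLK_BATCH_IDLE;
}
#endif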
291
292
struct ublk_device {
293
struct gendisk *ub_disk;
294
295
struct ublksrv_ctrl_dev_info dev_info;
296
297
struct blk_mq_tag_set tag_set;
298
299
struct cdev cdev;
300
struct device cdev_dev;
301
302
#define UB_STATE_OPEN 0
303
#define UB_STATE_USED 1
304
#define UB_STATE_DELETED 2
305
unsigned long state;
306
int ub_number;
307
308
struct mutex mutex;
309
310
spinlock_t lock;
311
struct mm_struct *mm;
312
313
struct ublk_params params;
314
315
struct completion completion;
316
u32 nr_queue_ready;
317
bool unprivileged_daemons;
318
struct mutex cancel_mutex;
319
bool canceling;
320
pid_t ublksrv_tgid;
321
struct delayed_work exit_work;
322
struct work_struct partition_scan_work;
323
324
bool block_open; /* protected by open_mutex */
325
326
struct ublk_queue *queues[];
327
};
328
329
/* header of ublk_params */
330
struct ublk_params_header {
331
__u32 len;
332
__u32 types;
333
};
334
335
static void ublk_io_release(void *priv);
336
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
337
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
338
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
339
u16 q_id, u16 tag, struct ublk_io *io);
340
static inline unsigned int ublk_req_build_flags(struct request *req);
341
static void ublk_batch_dispatch(struct ublk_queue *ubq,
342
const struct ublk_batch_io_data *data,
343
struct ublk_batch_fetch_cmd *fcmd);
344
345
static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
346
{
347
return ub->dev_info.flags & UBLK_F_BATCH_IO;
348
}
349
350
static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
351
{
352
return ubq->flags & UBLK_F_BATCH_IO;
353
}
354
355
static inline void ublk_io_lock(struct ublk_io *io)
356
{
357
spin_lock(&io->lock);
358
}
359
360
static inline void ublk_io_unlock(struct ublk_io *io)
361
{
362
spin_unlock(&io->lock);
363
}
364
365
/* Initialize the event queue */
366
static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
367
int numa_node)
368
{
369
spin_lock_init(&q->evts_lock);
370
return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
371
}
372
373
/* Check if event queue is empty */
374
static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
375
{
376
return kfifo_is_empty(&q->evts_fifo);
377
}
378
379
static inline void ublk_io_evts_deinit(struct ublk_queue *q)
380
{
381
WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
382
kfifo_free(&q->evts_fifo);
383
}
384
385
static inline struct ublksrv_io_desc *
386
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
387
{
388
return &ubq->io_cmd_buf[tag];
389
}
390
391
static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
392
{
393
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
394
}
395
396
static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
397
{
398
return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
399
}
400
401
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
402
{
403
return ubq->flags & UBLK_F_AUTO_BUF_REG;
404
}
405
406
static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
407
{
408
return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
409
}
410
411
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
412
{
413
return ubq->flags & UBLK_F_USER_COPY;
414
}
415
416
static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
417
{
418
return ub->dev_info.flags & UBLK_F_USER_COPY;
419
}
420
421
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
422
{
423
return ub->dev_info.flags & UBLK_F_ZONED;
424
}
425
426
static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
427
{
428
return ubq->flags & UBLK_F_ZONED;
429
}
430
431
static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
432
{
433
return ub->dev_info.flags & UBLK_F_INTEGRITY;
434
}
435
436
#ifdef CONFIG_BLK_DEV_ZONED
437
438
struct ublk_zoned_report_desc {
439
__u64 sector;
440
__u32 operation;
441
__u32 nr_zones;
442
};
443
444
static DEFINE_XARRAY(ublk_zoned_report_descs);
445
446
static int ublk_zoned_insert_report_desc(const struct request *req,
447
struct ublk_zoned_report_desc *desc)
448
{
449
return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
450
desc, GFP_KERNEL);
451
}
452
453
static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
454
const struct request *req)
455
{
456
return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
457
}
458
459
static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
460
const struct request *req)
461
{
462
return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
463
}
464
465
static int ublk_get_nr_zones(const struct ublk_device *ub)
466
{
467
const struct ublk_param_basic *p = &ub->params.basic;
468
469
/* Zone size is a power of 2 */
470
return p->dev_sectors >> ilog2(p->chunk_sectors);
471
}
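
/*
 * Worked example for the computation above (illustrative numbers only):
 * with p->dev_sectors = 2097152 (1 GiB worth of 512-byte sectors) and
 * p->chunk_sectors = 4096 (2 MiB zones), ilog2(4096) = 12, so the device
 * has 2097152 >> 12 = 512 zones.
 */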
472
473
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
474
{
475
return blk_revalidate_disk_zones(ub->ub_disk);
476
}
477
478
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
479
{
480
const struct ublk_param_zoned *p = &ub->params.zoned;
481
int nr_zones;
482
483
if (!ublk_dev_is_zoned(ub))
484
return -EINVAL;
485
486
if (!p->max_zone_append_sectors)
487
return -EINVAL;
488
489
nr_zones = ublk_get_nr_zones(ub);
490
491
if (p->max_active_zones > nr_zones)
492
return -EINVAL;
493
494
if (p->max_open_zones > nr_zones)
495
return -EINVAL;
496
497
return 0;
498
}
499
500
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
501
{
502
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
503
}
504
505
/* Based on virtblk_alloc_report_buffer */
506
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
507
unsigned int nr_zones, size_t *buflen)
508
{
509
struct request_queue *q = ublk->ub_disk->queue;
510
size_t bufsize;
511
void *buf;
512
513
nr_zones = min_t(unsigned int, nr_zones,
514
ublk->ub_disk->nr_zones);
515
516
bufsize = nr_zones * sizeof(struct blk_zone);
517
bufsize =
518
min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
519
520
while (bufsize >= sizeof(struct blk_zone)) {
521
buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
522
if (buf) {
523
*buflen = bufsize;
524
return buf;
525
}
526
bufsize >>= 1;
527
}
528
529
*buflen = 0;
530
return NULL;
531
}
532
533
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
534
unsigned int nr_zones, struct blk_report_zones_args *args)
535
{
536
struct ublk_device *ub = disk->private_data;
537
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
538
unsigned int first_zone = sector >> ilog2(zone_size_sectors);
539
unsigned int done_zones = 0;
540
unsigned int max_zones_per_request;
541
int ret;
542
struct blk_zone *buffer;
543
size_t buffer_length;
544
545
nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
546
nr_zones);
547
548
buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
549
if (!buffer)
550
return -ENOMEM;
551
552
max_zones_per_request = buffer_length / sizeof(struct blk_zone);
553
554
while (done_zones < nr_zones) {
555
unsigned int remaining_zones = nr_zones - done_zones;
556
unsigned int zones_in_request =
557
min_t(unsigned int, remaining_zones, max_zones_per_request);
558
struct request *req;
559
struct ublk_zoned_report_desc desc;
560
blk_status_t status;
561
562
memset(buffer, 0, buffer_length);
563
564
req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
565
if (IS_ERR(req)) {
566
ret = PTR_ERR(req);
567
goto out;
568
}
569
570
desc.operation = UBLK_IO_OP_REPORT_ZONES;
571
desc.sector = sector;
572
desc.nr_zones = zones_in_request;
573
ret = ublk_zoned_insert_report_desc(req, &desc);
574
if (ret)
575
goto free_req;
576
577
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
578
if (ret)
579
goto erase_desc;
580
581
status = blk_execute_rq(req, 0);
582
ret = blk_status_to_errno(status);
583
erase_desc:
584
ublk_zoned_erase_report_desc(req);
585
free_req:
586
blk_mq_free_request(req);
587
if (ret)
588
goto out;
589
590
for (unsigned int i = 0; i < zones_in_request; i++) {
591
struct blk_zone *zone = buffer + i;
592
593
/* A zero length zone means no more zones in this response */
594
if (!zone->len)
595
break;
596
597
ret = disk_report_zone(disk, zone, i, args);
598
if (ret)
599
goto out;
600
601
done_zones++;
602
sector += zone_size_sectors;
603
604
}
605
}
606
607
ret = done_zones;
608
609
out:
610
kvfree(buffer);
611
return ret;
612
}
613
614
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
615
struct request *req)
616
{
617
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
618
struct ublk_io *io = &ubq->ios[req->tag];
619
struct ublk_zoned_report_desc *desc;
620
u32 ublk_op;
621
622
switch (req_op(req)) {
623
case REQ_OP_ZONE_OPEN:
624
ublk_op = UBLK_IO_OP_ZONE_OPEN;
625
break;
626
case REQ_OP_ZONE_CLOSE:
627
ublk_op = UBLK_IO_OP_ZONE_CLOSE;
628
break;
629
case REQ_OP_ZONE_FINISH:
630
ublk_op = UBLK_IO_OP_ZONE_FINISH;
631
break;
632
case REQ_OP_ZONE_RESET:
633
ublk_op = UBLK_IO_OP_ZONE_RESET;
634
break;
635
case REQ_OP_ZONE_APPEND:
636
ublk_op = UBLK_IO_OP_ZONE_APPEND;
637
break;
638
case REQ_OP_ZONE_RESET_ALL:
639
ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
640
break;
641
case REQ_OP_DRV_IN:
642
desc = ublk_zoned_get_report_desc(req);
643
if (!desc)
644
return BLK_STS_IOERR;
645
ublk_op = desc->operation;
646
switch (ublk_op) {
647
case UBLK_IO_OP_REPORT_ZONES:
648
iod->op_flags = ublk_op | ublk_req_build_flags(req);
649
iod->nr_zones = desc->nr_zones;
650
iod->start_sector = desc->sector;
651
return BLK_STS_OK;
652
default:
653
return BLK_STS_IOERR;
654
}
655
case REQ_OP_DRV_OUT:
656
/* We do not support drv_out */
657
return BLK_STS_NOTSUPP;
658
default:
659
return BLK_STS_IOERR;
660
}
661
662
iod->op_flags = ublk_op | ublk_req_build_flags(req);
663
iod->nr_sectors = blk_rq_sectors(req);
664
iod->start_sector = blk_rq_pos(req);
665
iod->addr = io->buf.addr;
666
667
return BLK_STS_OK;
668
}
669
670
#else
671
672
#define ublk_report_zones (NULL)
673
674
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
675
{
676
return -EOPNOTSUPP;
677
}
678
679
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
680
{
681
}
682
683
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
684
{
685
return 0;
686
}
687
688
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
689
struct request *req)
690
{
691
return BLK_STS_NOTSUPP;
692
}
693
694
#endif
695
696
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
697
bool need_map, struct io_comp_batch *iob);
698
699
static dev_t ublk_chr_devt;
700
static const struct class ublk_chr_class = {
701
.name = "ublk-char",
702
};
703
704
static DEFINE_IDR(ublk_index_idr);
705
static DEFINE_SPINLOCK(ublk_idr_lock);
706
static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
707
708
static DEFINE_MUTEX(ublk_ctl_mutex);
709
710
static struct ublk_batch_fetch_cmd *
711
ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
712
{
713
struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO);
714
715
if (fcmd) {
716
fcmd->cmd = cmd;
717
fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
718
}
719
return fcmd;
720
}
721
722
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
723
{
724
kfree(fcmd);
725
}
726
727
static void __ublk_release_fcmd(struct ublk_queue *ubq)
728
{
729
WRITE_ONCE(ubq->active_fcmd, NULL);
730
}
731
732
/*
 * Nothing can make progress any more, so clear ->active_fcmd; the caller
 * should stop dispatching.
 */
736
static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
737
const struct ublk_batch_io_data *data,
738
struct ublk_batch_fetch_cmd *fcmd,
739
int res)
740
{
741
spin_lock(&ubq->evts_lock);
742
list_del_init(&fcmd->node);
743
WARN_ON_ONCE(fcmd != ubq->active_fcmd);
744
__ublk_release_fcmd(ubq);
745
spin_unlock(&ubq->evts_lock);
746
747
io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
748
ublk_batch_free_fcmd(fcmd);
749
}
750
751
static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
752
struct io_br_sel *sel,
753
unsigned int issue_flags)
754
{
755
if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
756
return -ENOBUFS;
757
return 0;
758
}
759
760
static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
761
void __user *buf, const u16 *tag_buf,
762
unsigned int len)
763
{
764
if (copy_to_user(buf, tag_buf, len))
765
return -EFAULT;
766
return len;
767
}
768
769
#define UBLK_MAX_UBLKS UBLK_MINORS
770
771
/*
 * Max number of unprivileged ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in the future, or even be
 * controlled by cgroup.
 */
777
static unsigned int unprivileged_ublks_max = 64;
778
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
779
780
static struct miscdevice ublk_misc;
781
782
static inline unsigned ublk_pos_to_hwq(loff_t pos)
783
{
784
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
785
UBLK_QID_BITS_MASK;
786
}
787
788
static inline unsigned ublk_pos_to_buf_off(loff_t pos)
789
{
790
return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
791
}
792
793
static inline unsigned ublk_pos_to_tag(loff_t pos)
794
{
795
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
796
UBLK_TAG_BITS_MASK;
797
}
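
/*
 * Illustrative sketch only: the inverse of the three decoding helpers above,
 * i.e. how a ublk server could compute the char-device offset used for
 * user-copy pread()/pwrite() of a given (q_id, tag) pair. The helper name is
 * hypothetical; the constants come from <uapi/linux/ublk_cmd.h>.
 */
#if 0
static inline loff_t ublk_user_copy_pos(unsigned int q_id, unsigned int tag,
					unsigned int buf_off)
{
	return UBLKSRV_IO_BUF_OFFSET +
		((loff_t)q_id << UBLK_QID_OFF) +
		((loff_t)tag << UBLK_TAG_OFF) +
		buf_off;
}
#endif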
798
799
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
800
{
801
const struct ublk_param_basic *p = &ub->params.basic;
802
803
if (p->attrs & UBLK_ATTR_READ_ONLY)
804
set_disk_ro(ub->ub_disk, true);
805
806
set_capacity(ub->ub_disk, p->dev_sectors);
807
}
808
809
static int ublk_integrity_flags(u32 flags)
810
{
811
int ret_flags = 0;
812
813
if (flags & LBMD_PI_CAP_INTEGRITY) {
814
flags &= ~LBMD_PI_CAP_INTEGRITY;
815
ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
816
}
817
if (flags & LBMD_PI_CAP_REFTAG) {
818
flags &= ~LBMD_PI_CAP_REFTAG;
819
ret_flags |= BLK_INTEGRITY_REF_TAG;
820
}
821
return flags ? -EINVAL : ret_flags;
822
}
823
824
static int ublk_integrity_pi_tuple_size(u8 csum_type)
825
{
826
switch (csum_type) {
827
case LBMD_PI_CSUM_NONE:
828
return 0;
829
case LBMD_PI_CSUM_IP:
830
case LBMD_PI_CSUM_CRC16_T10DIF:
831
return 8;
832
case LBMD_PI_CSUM_CRC64_NVME:
833
return 16;
834
default:
835
return -EINVAL;
836
}
837
}
838
839
static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
840
{
841
switch (csum_type) {
842
case LBMD_PI_CSUM_NONE:
843
return BLK_INTEGRITY_CSUM_NONE;
844
case LBMD_PI_CSUM_IP:
845
return BLK_INTEGRITY_CSUM_IP;
846
case LBMD_PI_CSUM_CRC16_T10DIF:
847
return BLK_INTEGRITY_CSUM_CRC;
848
case LBMD_PI_CSUM_CRC64_NVME:
849
return BLK_INTEGRITY_CSUM_CRC64;
850
default:
851
WARN_ON_ONCE(1);
852
return BLK_INTEGRITY_CSUM_NONE;
853
}
854
}
855
856
static int ublk_validate_params(const struct ublk_device *ub)
857
{
858
/* basic param is the only one which must be set */
859
if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
860
const struct ublk_param_basic *p = &ub->params.basic;
861
862
if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
863
return -EINVAL;
864
865
if (p->logical_bs_shift > p->physical_bs_shift)
866
return -EINVAL;
867
868
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
869
return -EINVAL;
870
871
if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
872
return -EINVAL;
873
} else
874
return -EINVAL;
875
876
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
877
const struct ublk_param_discard *p = &ub->params.discard;
878
879
/* So far, only single-segment discard is supported */
880
if (p->max_discard_sectors && p->max_discard_segments != 1)
881
return -EINVAL;
882
883
if (!p->discard_granularity)
884
return -EINVAL;
885
}
886
887
/* dev_t is read-only */
888
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
889
return -EINVAL;
890
891
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
892
return ublk_dev_param_zoned_validate(ub);
893
else if (ublk_dev_is_zoned(ub))
894
return -EINVAL;
895
896
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
897
const struct ublk_param_dma_align *p = &ub->params.dma;
898
899
if (p->alignment >= PAGE_SIZE)
900
return -EINVAL;
901
902
if (!is_power_of_2(p->alignment + 1))
903
return -EINVAL;
904
}
905
906
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
907
const struct ublk_param_segment *p = &ub->params.seg;
908
909
if (!is_power_of_2(p->seg_boundary_mask + 1))
910
return -EINVAL;
911
912
if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
913
return -EINVAL;
914
if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
915
return -EINVAL;
916
}
917
918
if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
919
const struct ublk_param_integrity *p = &ub->params.integrity;
920
int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
921
int flags = ublk_integrity_flags(p->flags);
922
923
if (!ublk_dev_support_integrity(ub))
924
return -EINVAL;
925
if (flags < 0)
926
return flags;
927
if (pi_tuple_size < 0)
928
return pi_tuple_size;
929
if (!p->metadata_size)
930
return -EINVAL;
931
if (p->csum_type == LBMD_PI_CSUM_NONE &&
932
p->flags & LBMD_PI_CAP_REFTAG)
933
return -EINVAL;
934
if (p->pi_offset + pi_tuple_size > p->metadata_size)
935
return -EINVAL;
936
if (p->interval_exp < SECTOR_SHIFT ||
937
p->interval_exp > ub->params.basic.logical_bs_shift)
938
return -EINVAL;
939
}
940
941
return 0;
942
}
943
944
static void ublk_apply_params(struct ublk_device *ub)
945
{
946
ublk_dev_param_basic_apply(ub);
947
948
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
949
ublk_dev_param_zoned_apply(ub);
950
}
951
952
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
953
{
954
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
955
!ublk_support_auto_buf_reg(ubq);
956
}
957
958
static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
959
{
960
return !ublk_dev_support_user_copy(ub) &&
961
!ublk_dev_support_zero_copy(ub) &&
962
!ublk_dev_support_auto_buf_reg(ub);
963
}
964
965
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
966
{
967
/*
 * read()/write() is involved in user copy, so a request reference
 * has to be grabbed.
 *
 * For zero copy, the request buffer needs to be registered in the
 * io_uring buffer table, so a reference is needed.
 *
 * For auto buffer registration, the ublk server may still issue
 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
 * so a reference is required too.
 */
978
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
979
ublk_support_auto_buf_reg(ubq);
980
}
981
982
static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
983
{
984
return ublk_dev_support_user_copy(ub) ||
985
ublk_dev_support_zero_copy(ub) ||
986
ublk_dev_support_auto_buf_reg(ub);
987
}
988
989
/*
990
* ublk IO Reference Counting Design
991
* ==================================
992
*
993
* For user-copy and zero-copy modes, ublk uses a split reference model with
994
* two counters that together track IO lifetime:
995
*
996
* - io->ref: refcount for off-task buffer registrations and user-copy ops
997
* - io->task_registered_buffers: count of buffers registered on the IO task
998
*
999
* Key Invariant:
1000
* --------------
1001
* When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1002
* the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1003
* when no active references exist. After IO completion, both counters become
1004
* zero. For I/Os not currently dispatched to the ublk server, both ref and
1005
* task_registered_buffers are 0.
1006
*
1007
* This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1008
* exit to determine if all references have been released.
1009
*
1010
* Why Split Counters:
1011
* -------------------
1012
* Buffers registered on the IO daemon task can use the lightweight
1013
* task_registered_buffers counter (simple increment/decrement) instead of
1014
* atomic refcount operations. The ublk_io_release() callback checks if
1015
* current == io->task to decide which counter to update.
1016
*
1017
* This optimization only applies before IO completion. At completion,
1018
* ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1019
* After that, all subsequent buffer unregistrations must use the atomic ref
1020
* since they may be releasing the last reference.
1021
*
1022
* Reference Lifecycle:
1023
* --------------------
1024
* 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1025
*
1026
* 2. During IO processing:
1027
* - On-task buffer reg: task_registered_buffers++ (no ref change)
1028
* - Off-task buffer reg: ref++ via ublk_get_req_ref()
1029
* - Buffer unregister callback (ublk_io_release):
1030
* * If on-task: task_registered_buffers--
1031
* * If off-task: ref-- via ublk_put_req_ref()
1032
*
1033
* 3. ublk_sub_req_ref() at IO completion:
1034
* - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1035
* - Subtracts sub_refs from ref and zeroes task_registered_buffers
1036
* - This effectively collapses task_registered_buffers into the atomic ref,
1037
* accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1038
* buffers that were already counted
1039
*
1040
* Example (zero-copy, register on-task, unregister off-task):
1041
* - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1042
* - Register buffer on-task: task_registered_buffers = 1
1043
* - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1044
* - Completion via ublk_sub_req_ref():
1045
* sub_refs = UBLK_REFCOUNT_INIT - 1,
1046
* ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1047
*
1048
* Example (auto buffer registration):
1049
* Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1050
*
1051
* - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1052
* - Buffer unregister: task_registered_buffers-- (becomes 0)
1053
* - Completion via ublk_sub_req_ref():
1054
* sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1055
*
1056
* Example (zero-copy, ublk server killed):
1057
* When daemon is killed, io_uring cleanup unregisters buffers off-task.
1058
* ublk_check_and_reset_active_ref() waits for the invariant to hold.
1059
*
1060
* - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1061
* - Register buffer on-task: task_registered_buffers = 1
1062
* - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1063
* ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1064
* - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1065
* - Since the sum equals UBLK_REFCOUNT_INIT, both counters are zeroed by
* ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
* and abort pending requests
1068
*
1069
* Batch IO Special Case:
1070
* ----------------------
1071
* In batch IO mode, io->task is NULL. This means ublk_io_release() always
1072
* takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1073
* task_registered_buffers counter still tracks registered buffers for the
1074
* invariant check, even though the callback doesn't decrement it.
1075
*
1076
* Note: updating task_registered_buffers is protected by io->lock.
1077
*/
1078
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1079
struct ublk_io *io)
1080
{
1081
if (ublk_need_req_ref(ubq))
1082
refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1083
}
1084
1085
static inline bool ublk_get_req_ref(struct ublk_io *io)
1086
{
1087
return refcount_inc_not_zero(&io->ref);
1088
}
1089
1090
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1091
{
1092
if (!refcount_dec_and_test(&io->ref))
1093
return;
1094
1095
/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1096
__ublk_complete_rq(req, io, false, NULL);
1097
}
1098
1099
static inline bool ublk_sub_req_ref(struct ublk_io *io)
1100
{
1101
unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1102
1103
io->task_registered_buffers = 0;
1104
return refcount_sub_and_test(sub_refs, &io->ref);
1105
}
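
/*
 * Illustrative sketch only: the invariant from the "ublk IO Reference
 * Counting Design" comment above, written as an assertion. For an IO that is
 * dispatched to the ublk server and has no outstanding off-task references,
 * io->ref plus io->task_registered_buffers must equal UBLK_REFCOUNT_INIT.
 * The helper is hypothetical and not used by the driver.
 */
#if 0
static void ublk_assert_srv_owned_ref(const struct ublk_io *io)
{
	WARN_ON_ONCE(refcount_read(&io->ref) +
		     io->task_registered_buffers != UBLK_REFCOUNT_INIT);
}
#endif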
1106
1107
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1108
{
1109
return ubq->flags & UBLK_F_NEED_GET_DATA;
1110
}
1111
1112
static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1113
{
1114
return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1115
}
1116
1117
/* Called in slow path only, keep it noinline for trace purpose */
1118
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1119
{
1120
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1121
return ub;
1122
return NULL;
1123
}
1124
1125
/* Called in slow path only, keep it noinline for trace purpose */
1126
static noinline void ublk_put_device(struct ublk_device *ub)
1127
{
1128
put_device(&ub->cdev_dev);
1129
}
1130
1131
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1132
int qid)
1133
{
1134
return dev->queues[qid];
1135
}
1136
1137
static inline bool ublk_rq_has_data(const struct request *rq)
1138
{
1139
return bio_has_data(rq->bio);
1140
}
1141
1142
static inline struct ublksrv_io_desc *
1143
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1144
{
1145
return ublk_get_queue(ub, q_id)->io_cmd_buf;
1146
}
1147
1148
static inline int __ublk_queue_cmd_buf_size(int depth)
1149
{
1150
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1151
}
1152
1153
static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1154
{
1155
return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1156
}
1157
1158
static int ublk_max_cmd_buf_size(void)
1159
{
1160
return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1161
}
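
/*
 * Worked example for the sizing above (illustrative; assumes the current
 * 24-byte struct ublksrv_io_desc UAPI layout and 4 KiB pages): a queue
 * depth of 128 needs 128 * 24 = 3072 bytes, which round_up() turns into a
 * single 4096-byte page for the per-queue io command buffer.
 */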
1162
1163
/*
 * Should I/O outstanding to the ublk server be reissued when the server
 * exits? If not, outstanding I/O will get errors.
 */
1167
static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1168
{
1169
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1170
(ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1171
}
1172
1173
/*
 * Should I/O issued while there is no ublk server be queued? If not, I/O
 * issued while there is no ublk server will get errors.
 */
1177
static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1178
{
1179
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1180
!(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1181
}
1182
1183
/*
1184
* Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1185
* of the device flags for smaller cache footprint - better for fast
1186
* paths.
1187
*/
1188
static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1189
{
1190
return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1191
!(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1192
}
1193
1194
/*
1195
* Should ublk devices be stopped (i.e. no recovery possible) when the
1196
* ublk server exits? If not, devices can be used again by a future
1197
* incarnation of a ublk server via the start_recovery/end_recovery
1198
* commands.
1199
*/
1200
static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1201
{
1202
return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1203
}
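
/*
 * Summary of the recovery policy implemented by the helpers above (derived
 * from their definitions; added here only as documentation):
 *
 * - UBLK_F_USER_RECOVERY not set: the device is stopped when the ublk
 *   server exits; no recovery is possible.
 * - UBLK_F_USER_RECOVERY: the device survives server exit; new I/O is
 *   queued unless UBLK_F_USER_RECOVERY_FAIL_IO is also set, and outstanding
 *   I/O gets errors unless UBLK_F_USER_RECOVERY_REISSUE is also set.
 * - UBLK_F_USER_RECOVERY_REISSUE: outstanding I/O is reissued instead of
 *   getting errors.
 * - UBLK_F_USER_RECOVERY_FAIL_IO: I/O issued while there is no ublk server
 *   gets errors instead of being queued.
 */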
1204
1205
static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1206
{
1207
return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1208
ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1209
}
1210
1211
static void ublk_free_disk(struct gendisk *disk)
1212
{
1213
struct ublk_device *ub = disk->private_data;
1214
1215
clear_bit(UB_STATE_USED, &ub->state);
1216
ublk_put_device(ub);
1217
}
1218
1219
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1220
unsigned int *owner_gid)
1221
{
1222
kuid_t uid;
1223
kgid_t gid;
1224
1225
current_uid_gid(&uid, &gid);
1226
1227
*owner_uid = from_kuid(&init_user_ns, uid);
1228
*owner_gid = from_kgid(&init_user_ns, gid);
1229
}
1230
1231
static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1232
{
1233
struct ublk_device *ub = disk->private_data;
1234
1235
if (capable(CAP_SYS_ADMIN))
1236
return 0;
1237
1238
/*
 * If it is an unprivileged device, only the owner can open
 * the disk. Otherwise it could be a trap set up by a
 * malicious user who deliberately grants this disk's
 * privileges to other users.
 *
 * This is also reasonable given that anyone can create an
 * unprivileged device without needing anyone else's grant.
 */
1247
if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1248
unsigned int curr_uid, curr_gid;
1249
1250
ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1251
1252
if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1253
ub->dev_info.owner_gid)
1254
return -EPERM;
1255
}
1256
1257
if (ub->block_open)
1258
return -ENXIO;
1259
1260
return 0;
1261
}
1262
1263
static const struct block_device_operations ub_fops = {
1264
.owner = THIS_MODULE,
1265
.open = ublk_open,
1266
.free_disk = ublk_free_disk,
1267
.report_zones = ublk_report_zones,
1268
};
1269
1270
static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1271
struct iov_iter *uiter, int dir, size_t *done)
1272
{
1273
unsigned len;
1274
void *bv_buf;
1275
size_t copied;
1276
1277
if (*offset >= bv->bv_len) {
1278
*offset -= bv->bv_len;
1279
return true;
1280
}
1281
1282
len = bv->bv_len - *offset;
1283
bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1284
if (dir == ITER_DEST)
1285
copied = copy_to_iter(bv_buf, len, uiter);
1286
else
1287
copied = copy_from_iter(bv_buf, len, uiter);
1288
1289
kunmap_local(bv_buf);
1290
1291
*done += copied;
1292
if (copied < len)
1293
return false;
1294
1295
*offset = 0;
1296
return true;
1297
}
1298
1299
/*
 * Copy data between the request pages and the io_iter; 'offset' is the
 * starting linear offset within the request.
 */
1303
static size_t ublk_copy_user_pages(const struct request *req,
1304
unsigned offset, struct iov_iter *uiter, int dir)
1305
{
1306
struct req_iterator iter;
1307
struct bio_vec bv;
1308
size_t done = 0;
1309
1310
rq_for_each_segment(bv, req, iter) {
1311
if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1312
break;
1313
}
1314
return done;
1315
}
1316
1317
#ifdef CONFIG_BLK_DEV_INTEGRITY
1318
static size_t ublk_copy_user_integrity(const struct request *req,
1319
unsigned offset, struct iov_iter *uiter, int dir)
1320
{
1321
size_t done = 0;
1322
struct bio *bio = req->bio;
1323
struct bvec_iter iter;
1324
struct bio_vec iv;
1325
1326
if (!blk_integrity_rq(req))
1327
return 0;
1328
1329
bio_for_each_integrity_vec(iv, bio, iter) {
1330
if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1331
break;
1332
}
1333
1334
return done;
1335
}
1336
#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1337
static size_t ublk_copy_user_integrity(const struct request *req,
1338
unsigned offset, struct iov_iter *uiter, int dir)
1339
{
1340
return 0;
1341
}
1342
#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1343
1344
static inline bool ublk_need_map_req(const struct request *req)
1345
{
1346
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1347
}
1348
1349
static inline bool ublk_need_unmap_req(const struct request *req)
1350
{
1351
return ublk_rq_has_data(req) &&
1352
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1353
}
1354
1355
static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1356
const struct request *req,
1357
const struct ublk_io *io)
1358
{
1359
const unsigned int rq_bytes = blk_rq_bytes(req);
1360
1361
if (!ublk_need_map_io(ubq))
1362
return rq_bytes;
1363
1364
/*
 * No zero copy: we delay copying WRITE request data into the ublksrv
 * context, and the big benefit is that pinning pages in the current
 * context is pretty fast, see ublk_pin_user_pages.
 */
1369
if (ublk_need_map_req(req)) {
1370
struct iov_iter iter;
1371
const int dir = ITER_DEST;
1372
1373
import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1374
return ublk_copy_user_pages(req, 0, &iter, dir);
1375
}
1376
return rq_bytes;
1377
}
1378
1379
static unsigned int ublk_unmap_io(bool need_map,
1380
const struct request *req,
1381
const struct ublk_io *io)
1382
{
1383
const unsigned int rq_bytes = blk_rq_bytes(req);
1384
1385
if (!need_map)
1386
return rq_bytes;
1387
1388
if (ublk_need_unmap_req(req)) {
1389
struct iov_iter iter;
1390
const int dir = ITER_SOURCE;
1391
1392
WARN_ON_ONCE(io->res > rq_bytes);
1393
1394
import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1395
return ublk_copy_user_pages(req, 0, &iter, dir);
1396
}
1397
return rq_bytes;
1398
}
1399
1400
static inline unsigned int ublk_req_build_flags(struct request *req)
1401
{
1402
unsigned flags = 0;
1403
1404
if (req->cmd_flags & REQ_FAILFAST_DEV)
1405
flags |= UBLK_IO_F_FAILFAST_DEV;
1406
1407
if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1408
flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1409
1410
if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1411
flags |= UBLK_IO_F_FAILFAST_DRIVER;
1412
1413
if (req->cmd_flags & REQ_META)
1414
flags |= UBLK_IO_F_META;
1415
1416
if (req->cmd_flags & REQ_FUA)
1417
flags |= UBLK_IO_F_FUA;
1418
1419
if (req->cmd_flags & REQ_NOUNMAP)
1420
flags |= UBLK_IO_F_NOUNMAP;
1421
1422
if (req->cmd_flags & REQ_SWAP)
1423
flags |= UBLK_IO_F_SWAP;
1424
1425
if (blk_integrity_rq(req))
1426
flags |= UBLK_IO_F_INTEGRITY;
1427
1428
return flags;
1429
}
1430
1431
static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1432
{
1433
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1434
struct ublk_io *io = &ubq->ios[req->tag];
1435
u32 ublk_op;
1436
1437
switch (req_op(req)) {
1438
case REQ_OP_READ:
1439
ublk_op = UBLK_IO_OP_READ;
1440
break;
1441
case REQ_OP_WRITE:
1442
ublk_op = UBLK_IO_OP_WRITE;
1443
break;
1444
case REQ_OP_FLUSH:
1445
ublk_op = UBLK_IO_OP_FLUSH;
1446
break;
1447
case REQ_OP_DISCARD:
1448
ublk_op = UBLK_IO_OP_DISCARD;
1449
break;
1450
case REQ_OP_WRITE_ZEROES:
1451
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1452
break;
1453
default:
1454
if (ublk_queue_is_zoned(ubq))
1455
return ublk_setup_iod_zoned(ubq, req);
1456
return BLK_STS_IOERR;
1457
}
1458
1459
/* need to translate since kernel may change */
1460
iod->op_flags = ublk_op | ublk_req_build_flags(req);
1461
iod->nr_sectors = blk_rq_sectors(req);
1462
iod->start_sector = blk_rq_pos(req);
1463
iod->addr = io->buf.addr;
1464
1465
return BLK_STS_OK;
1466
}
1467
1468
static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1469
struct io_uring_cmd *ioucmd)
1470
{
1471
return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1472
}
1473
1474
static void ublk_end_request(struct request *req, blk_status_t error)
1475
{
1476
local_bh_disable();
1477
blk_mq_end_request(req, error);
1478
local_bh_enable();
1479
}
1480
1481
/* todo: handle partial completion */
1482
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1483
bool need_map, struct io_comp_batch *iob)
1484
{
1485
unsigned int unmapped_bytes;
1486
blk_status_t res = BLK_STS_OK;
1487
bool requeue;
1488
1489
/* fail the read IO if nothing was read */
1490
if (!io->res && req_op(req) == REQ_OP_READ)
1491
io->res = -EIO;
1492
1493
if (io->res < 0) {
1494
res = errno_to_blk_status(io->res);
1495
goto exit;
1496
}
1497
1498
/*
 * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count, so
 * end them directly.
 *
 * None of them needs unmapping either.
 */
1504
if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1505
req_op(req) != REQ_OP_DRV_IN)
1506
goto exit;
1507
1508
/* for READ request, writing data in iod->addr to rq buffers */
1509
unmapped_bytes = ublk_unmap_io(need_map, req, io);
1510
1511
/*
 * Extremely unlikely since the data was filled in just before.
 *
 * Simply truncate io->res for this unlikely case.
 */
1516
if (unlikely(unmapped_bytes < io->res))
1517
io->res = unmapped_bytes;
1518
1519
/*
1520
* Run bio->bi_end_io() with softirqs disabled. If the final fput
1521
* happens off this path, then that will prevent ublk's blkdev_release()
1522
* from being called on current's task work, see fput() implementation.
1523
*
1524
* Otherwise, ublk server may not provide forward progress in case of
1525
* reading the partition table from bdev_open() with disk->open_mutex
1526
* held, and causes dead lock as we could already be holding
1527
* disk->open_mutex here.
1528
*
1529
* Preferably we would not be doing IO with a mutex held that is also
1530
* used for release, but this work-around will suffice for now.
1531
*/
1532
local_bh_disable();
1533
requeue = blk_update_request(req, BLK_STS_OK, io->res);
1534
local_bh_enable();
1535
if (requeue)
1536
blk_mq_requeue_request(req, true);
1537
else if (likely(!blk_should_fake_timeout(req->q))) {
1538
if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1539
return;
1540
__blk_mq_end_request(req, BLK_STS_OK);
1541
}
1542
1543
return;
1544
exit:
1545
ublk_end_request(req, res);
1546
}
1547
1548
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1549
struct request *req)
1550
{
1551
/* read cmd first because req will overwrite it */
1552
struct io_uring_cmd *cmd = io->cmd;
1553
1554
/* mark this cmd owned by ublksrv */
1555
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1556
1557
/*
 * Clear ACTIVE since we are done with this sqe/cmd slot.
 * We can only accept an io cmd when it is not active.
 */
1561
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1562
1563
io->req = req;
1564
return cmd;
1565
}
1566
1567
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1568
int res, unsigned issue_flags)
1569
{
1570
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1571
1572
/* tell ublksrv one io request is coming */
1573
io_uring_cmd_done(cmd, res, issue_flags);
1574
}
1575
1576
#define UBLK_REQUEUE_DELAY_MS 3
1577
1578
static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1579
struct request *rq)
1580
{
1581
/* We cannot process this rq so just requeue it. */
1582
if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1583
blk_mq_requeue_request(rq, false);
1584
else
1585
ublk_end_request(rq, BLK_STS_IOERR);
1586
}
1587
1588
static void
1589
ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1590
{
1591
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1592
1593
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1594
}
1595
1596
enum auto_buf_reg_res {
1597
AUTO_BUF_REG_FAIL,
1598
AUTO_BUF_REG_FALLBACK,
1599
AUTO_BUF_REG_OK,
1600
};
1601
1602
/*
1603
* Setup io state after auto buffer registration.
1604
*
1605
* Must be called after ublk_auto_buf_register() is done.
1606
* Caller must hold io->lock in batch context.
1607
*/
1608
static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1609
struct request *req, struct ublk_io *io,
1610
struct io_uring_cmd *cmd,
1611
enum auto_buf_reg_res res)
1612
{
1613
if (res == AUTO_BUF_REG_OK) {
1614
io->task_registered_buffers = 1;
1615
io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1616
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1617
}
1618
ublk_init_req_ref(ubq, io);
1619
__ublk_prep_compl_io_cmd(io, req);
1620
}
1621
1622
/* Register request bvec to io_uring for auto buffer registration. */
1623
static enum auto_buf_reg_res
1624
ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1625
struct ublk_io *io, struct io_uring_cmd *cmd,
1626
unsigned int issue_flags)
1627
{
1628
int ret;
1629
1630
ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1631
io->buf.auto_reg.index, issue_flags);
1632
if (ret) {
1633
if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1634
ublk_auto_buf_reg_fallback(ubq, req->tag);
1635
return AUTO_BUF_REG_FALLBACK;
1636
}
1637
ublk_end_request(req, BLK_STS_IOERR);
1638
return AUTO_BUF_REG_FAIL;
1639
}
1640
1641
return AUTO_BUF_REG_OK;
1642
}
1643
1644
/*
1645
* Dispatch IO to userspace with auto buffer registration.
1646
*
1647
* Only called in non-batch context from task work, io->lock not held.
1648
*/
1649
static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1650
struct request *req, struct ublk_io *io,
1651
struct io_uring_cmd *cmd,
1652
unsigned int issue_flags)
1653
{
1654
enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1655
issue_flags);
1656
1657
if (res != AUTO_BUF_REG_FAIL) {
1658
ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1659
io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1660
}
1661
}
1662
1663
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1664
struct ublk_io *io)
1665
{
1666
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1667
1668
/* partially mapped, update io descriptor */
1669
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1670
/*
 * Nothing mapped, retry until we succeed.
 *
 * We may never succeed in mapping any bytes here because of OOM.
 * TODO: reserve one buffer with a single page pinned to provide a
 * forward progress guarantee.
 */
1677
if (unlikely(!mapped_bytes)) {
1678
blk_mq_requeue_request(req, false);
1679
blk_mq_delay_kick_requeue_list(req->q,
1680
UBLK_REQUEUE_DELAY_MS);
1681
return false;
1682
}
1683
1684
ublk_get_iod(ubq, req->tag)->nr_sectors =
1685
mapped_bytes >> 9;
1686
}
1687
1688
return true;
1689
}
1690
1691
static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1692
{
1693
unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1694
int tag = req->tag;
1695
struct ublk_io *io = &ubq->ios[tag];
1696
1697
pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1698
__func__, ubq->q_id, req->tag, io->flags,
1699
ublk_get_iod(ubq, req->tag)->addr);
1700
1701
/*
1702
* Task is exiting if either:
1703
*
1704
* (1) current != io->task.
1705
* io_uring_cmd_complete_in_task() tries to run task_work
1706
* in a workqueue if cmd's task is PF_EXITING.
1707
*
1708
* (2) current->flags & PF_EXITING.
1709
*/
1710
if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1711
__ublk_abort_rq(ubq, req);
1712
return;
1713
}
1714
1715
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1716
/*
1717
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
1718
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1719
* and notify it.
1720
*/
1721
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1722
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1723
__func__, ubq->q_id, req->tag, io->flags);
1724
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1725
issue_flags);
1726
return;
1727
}
1728
1729
if (!ublk_start_io(ubq, req, io))
1730
return;
1731
1732
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1733
ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1734
} else {
1735
ublk_init_req_ref(ubq, io);
1736
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1737
}
1738
}
1739
1740
static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1741
const struct ublk_batch_io_data *data,
1742
unsigned short tag)
1743
{
1744
struct ublk_device *ub = data->ub;
1745
struct ublk_io *io = &ubq->ios[tag];
1746
struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1747
enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1748
struct io_uring_cmd *cmd = data->cmd;
1749
1750
if (!ublk_start_io(ubq, req, io))
1751
return false;
1752
1753
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1754
res = ublk_auto_buf_register(ubq, req, io, cmd,
1755
data->issue_flags);
1756
1757
if (res == AUTO_BUF_REG_FAIL)
1758
return false;
1759
}
1760
1761
ublk_io_lock(io);
1762
ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1763
ublk_io_unlock(io);
1764
1765
return true;
1766
}
1767
1768
static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1769
const struct ublk_batch_io_data *data,
1770
unsigned short *tag_buf,
1771
unsigned int len)
1772
{
1773
bool has_unused = false;
1774
unsigned int i;
1775
1776
for (i = 0; i < len; i++) {
1777
unsigned short tag = tag_buf[i];
1778
1779
if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1780
tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1781
has_unused = true;
1782
}
1783
}
1784
1785
return has_unused;
1786
}
1787
1788
/*
1789
* Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1790
* Returns the new length after filtering.
1791
*/
1792
static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1793
unsigned int len)
1794
{
1795
unsigned int i, j;
1796
1797
for (i = 0, j = 0; i < len; i++) {
1798
if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1799
if (i != j)
1800
tag_buf[j] = tag_buf[i];
1801
j++;
1802
}
1803
}
1804
1805
return j;
1806
}
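
/*
 * Illustrative example only (hypothetical helper, never called): the
 * compaction behaviour of ublk_filter_unused_tags() above.
 */
#if 0
static void ublk_filter_unused_tags_example(void)
{
	unsigned short tags[] = { 3, UBLK_BATCH_IO_UNUSED_TAG, 7,
				  UBLK_BATCH_IO_UNUSED_TAG, 9 };
	unsigned int n = ublk_filter_unused_tags(tags, ARRAY_SIZE(tags));

	/* n == 3, and tags[] now begins with { 3, 7, 9 } */
	WARN_ON_ONCE(n != 3);
}
#endif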
1807
1808
#define MAX_NR_TAG 128
1809
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1810
const struct ublk_batch_io_data *data,
1811
struct ublk_batch_fetch_cmd *fcmd)
1812
{
1813
const unsigned int tag_sz = sizeof(unsigned short);
1814
unsigned short tag_buf[MAX_NR_TAG];
1815
struct io_br_sel sel;
1816
size_t len = 0;
1817
bool needs_filter;
1818
int ret;
1819
1820
WARN_ON_ONCE(data->cmd != fcmd->cmd);
1821
1822
sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1823
data->issue_flags);
1824
if (sel.val < 0)
1825
return sel.val;
1826
if (!sel.addr)
1827
return -ENOBUFS;
1828
1829
/* a single reader needs no lock, and sizeof(kfifo element) is 2 bytes */
1830
len = min(len, sizeof(tag_buf)) / tag_sz;
1831
len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1832
1833
needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1834
/* Filter out unused tags before posting to userspace */
1835
if (unlikely(needs_filter)) {
1836
int new_len = ublk_filter_unused_tags(tag_buf, len);
1837
1838
/* return the actual length if all tags failed or were requeued */
1839
if (!new_len) {
1840
/* release the selected buffer */
1841
sel.val = 0;
1842
WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1843
&sel, data->issue_flags));
1844
return len;
1845
}
1846
len = new_len;
1847
}
1848
1849
sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1850
ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1851
if (unlikely(ret < 0)) {
1852
int i, res;
1853
1854
/*
1855
* Undo prep state for all IOs since userspace never received them.
1856
* This restores IOs to pre-prepared state so they can be cleanly
1857
* re-prepared when tags are pulled from FIFO again.
1858
*/
1859
for (i = 0; i < len; i++) {
1860
struct ublk_io *io = &ubq->ios[tag_buf[i]];
1861
int index = -1;
1862
1863
ublk_io_lock(io);
1864
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1865
index = io->buf.auto_reg.index;
1866
io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1867
io->flags |= UBLK_IO_FLAG_ACTIVE;
1868
ublk_io_unlock(io);
1869
1870
if (index != -1)
1871
io_buffer_unregister_bvec(data->cmd, index,
1872
data->issue_flags);
1873
}
1874
1875
res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1876
tag_buf, len, &ubq->evts_lock);
1877
1878
pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1879
"tags(%d %zu) ret %d\n", __func__, res, len,
1880
ret);
1881
}
1882
return ret;
1883
}
1884
1885
static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1886
struct ublk_queue *ubq)
1887
{
1888
struct ublk_batch_fetch_cmd *fcmd;
1889
1890
lockdep_assert_held(&ubq->evts_lock);
1891
1892
/*
 * Order updating ubq->evts_fifo against checking ubq->active_fcmd.
 *
 * The pairing barrier is the smp_mb() in ublk_batch_dispatch().
 *
 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
 * are visible in ublk_batch_dispatch() thanks to the barrier pairing.
 */
1900
smp_mb();
1901
if (READ_ONCE(ubq->active_fcmd)) {
1902
fcmd = NULL;
1903
} else {
1904
fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1905
struct ublk_batch_fetch_cmd, node);
1906
WRITE_ONCE(ubq->active_fcmd, fcmd);
1907
}
1908
return fcmd;
1909
}
1910
1911
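/*
 * Descriptive note (added): task-work callback that runs batch dispatch in
 * the context of the fetch command's task.
 */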
static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1912
{
1913
unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1914
struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1915
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1916
struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1917
struct ublk_batch_io_data data = {
1918
.ub = pdu->ubq->dev,
1919
.cmd = fcmd->cmd,
1920
.issue_flags = issue_flags,
1921
};
1922
1923
WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1924
1925
ublk_batch_dispatch(pdu->ubq, &data, fcmd);
1926
}
1927
1928
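/*
 * Descriptive note (added): drain the event kfifo through the currently
 * active fetch command. On error the fetch buffer is torn down; otherwise
 * the active slot is released and the kfifo is re-checked, either looping
 * here (bounded to 32 batches) or punting to task work when a different
 * fetch command has to take over.
 */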
static void
1929
ublk_batch_dispatch(struct ublk_queue *ubq,
1930
const struct ublk_batch_io_data *data,
1931
struct ublk_batch_fetch_cmd *fcmd)
1932
{
1933
struct ublk_batch_fetch_cmd *new_fcmd;
1934
unsigned tried = 0;
1935
int ret = 0;
1936
1937
again:
1938
while (!ublk_io_evts_empty(ubq)) {
1939
ret = __ublk_batch_dispatch(ubq, data, fcmd);
1940
if (ret <= 0)
1941
break;
1942
}
1943
1944
if (ret < 0) {
1945
ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
1946
return;
1947
}
1948
1949
__ublk_release_fcmd(ubq);
1950
/*
1951
* Order the clearing of ubq->active_fcmd in __ublk_release_fcmd() against
1952
* checking ubq->evts_fifo.
1953
*
1954
* This pairs with the smp_mb() in __ublk_acquire_fcmd().
1955
*/
1956
smp_mb();
1957
if (likely(ublk_io_evts_empty(ubq)))
1958
return;
1959
1960
spin_lock(&ubq->evts_lock);
1961
new_fcmd = __ublk_acquire_fcmd(ubq);
1962
spin_unlock(&ubq->evts_lock);
1963
1964
if (!new_fcmd)
1965
return;
1966
1967
/* Avoid lockup by handling at most 32 batches in one run */
1968
if (new_fcmd == fcmd && tried++ < 32)
1969
goto again;
1970
1971
io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
1972
}
1973
1974
static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1975
{
1976
struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1977
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1978
struct ublk_queue *ubq = pdu->ubq;
1979
1980
ublk_dispatch_req(ubq, pdu->req);
1981
}
1982
1983
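/*
 * Descriptive note (added): queue a single request tag into the per-queue
 * event kfifo; when this is the last request of the current batch, try to
 * grab a fetch command and kick dispatch from task work.
 */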
static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
1984
{
1985
unsigned short tag = rq->tag;
1986
struct ublk_batch_fetch_cmd *fcmd = NULL;
1987
1988
spin_lock(&ubq->evts_lock);
1989
kfifo_put(&ubq->evts_fifo, tag);
1990
if (last)
1991
fcmd = __ublk_acquire_fcmd(ubq);
1992
spin_unlock(&ubq->evts_lock);
1993
1994
if (fcmd)
1995
io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
1996
}
1997
1998
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1999
{
2000
struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2001
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2002
2003
pdu->req = rq;
2004
io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2005
}
2006
2007
static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2008
{
2009
struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2010
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2011
struct request *rq = pdu->req_list;
2012
struct request *next;
2013
2014
do {
2015
next = rq->rq_next;
2016
rq->rq_next = NULL;
2017
ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2018
rq = next;
2019
} while (rq);
2020
}
2021
2022
static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2023
{
2024
struct io_uring_cmd *cmd = io->cmd;
2025
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2026
2027
pdu->req_list = rq_list_peek(l);
2028
rq_list_init(l);
2029
io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2030
}
2031
2032
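/*
 * Descriptive note (added): request timeout handler. Privileged devices
 * simply keep waiting, while for an UBLK_F_UNPRIVILEGED_DEV device the ublk
 * server task group is sent SIGKILL and BLK_EH_DONE is returned, letting the
 * normal daemon-teardown path fail the request.
 */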
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2033
{
2034
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2035
pid_t tgid = ubq->dev->ublksrv_tgid;
2036
struct task_struct *p;
2037
struct pid *pid;
2038
2039
if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2040
return BLK_EH_RESET_TIMER;
2041
2042
if (unlikely(!tgid))
2043
return BLK_EH_RESET_TIMER;
2044
2045
rcu_read_lock();
2046
pid = find_vpid(tgid);
2047
p = pid_task(pid, PIDTYPE_PID);
2048
if (p)
2049
send_sig(SIGKILL, p, 0);
2050
rcu_read_unlock();
2051
return BLK_EH_DONE;
2052
}
2053
2054
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2055
bool check_cancel)
2056
{
2057
blk_status_t res;
2058
2059
if (unlikely(READ_ONCE(ubq->fail_io)))
2060
return BLK_STS_TARGET;
2061
2062
/* With recovery feature enabled, force_abort is set in
2063
* ublk_stop_dev() before calling del_gendisk(). We have to
2064
* abort all requeued and new rqs here to let del_gendisk()
2065
* move on. Besides, we must not call io_uring_cmd_complete_in_task()
2066
* to avoid UAF on io_uring ctx.
2067
*
2068
* Note: force_abort is guaranteed to be seen because it is set
2069
* before the request queue is unquiesced.
2070
*/
2071
if (ublk_nosrv_should_queue_io(ubq) &&
2072
unlikely(READ_ONCE(ubq->force_abort)))
2073
return BLK_STS_IOERR;
2074
2075
if (check_cancel && unlikely(ubq->canceling))
2076
return BLK_STS_IOERR;
2077
2078
/* fill iod to slot in io cmd buffer */
2079
res = ublk_setup_iod(ubq, rq);
2080
if (unlikely(res != BLK_STS_OK))
2081
return BLK_STS_IOERR;
2082
2083
blk_mq_start_request(rq);
2084
return BLK_STS_OK;
2085
}
2086
2087
/*
2088
* Common helper for queue_rq that handles request preparation and
2089
* cancellation checks. Returns status and sets should_queue to indicate
2090
* whether the caller should proceed with queuing the request.
2091
*/
2092
static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2093
struct request *rq,
2094
bool *should_queue)
2095
{
2096
blk_status_t res;
2097
2098
res = ublk_prep_req(ubq, rq, false);
2099
if (res != BLK_STS_OK) {
2100
*should_queue = false;
2101
return res;
2102
}
2103
2104
/*
2105
* ->canceling has to be handled after ->force_abort and ->fail_io
2106
* are dealt with, otherwise this request may not be failed in case
2107
* of recovery, causing a hang when deleting the disk
2108
*/
2109
if (unlikely(ubq->canceling)) {
2110
*should_queue = false;
2111
__ublk_abort_rq(ubq, rq);
2112
return BLK_STS_OK;
2113
}
2114
2115
*should_queue = true;
2116
return BLK_STS_OK;
2117
}
2118
2119
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2120
const struct blk_mq_queue_data *bd)
2121
{
2122
struct ublk_queue *ubq = hctx->driver_data;
2123
struct request *rq = bd->rq;
2124
bool should_queue;
2125
blk_status_t res;
2126
2127
res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2128
if (!should_queue)
2129
return res;
2130
2131
ublk_queue_cmd(ubq, rq);
2132
return BLK_STS_OK;
2133
}
2134
2135
static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2136
const struct blk_mq_queue_data *bd)
2137
{
2138
struct ublk_queue *ubq = hctx->driver_data;
2139
struct request *rq = bd->rq;
2140
bool should_queue;
2141
blk_status_t res;
2142
2143
res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2144
if (!should_queue)
2145
return res;
2146
2147
ublk_batch_queue_cmd(ubq, rq, bd->last);
2148
return BLK_STS_OK;
2149
}
2150
2151
static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2152
const struct ublk_io *io2)
2153
{
2154
return (io_uring_cmd_ctx_handle(io->cmd) ==
2155
io_uring_cmd_ctx_handle(io2->cmd)) &&
2156
(io->task == io2->task);
2157
}
2158
2159
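/*
 * Descriptive note (added): ->commit_rqs() callback. Tags may have been
 * queued via ublk_batch_queue_rq() without bd->last set, so make sure a
 * fetch command is acquired and batch dispatch gets kicked.
 */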
static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2160
{
2161
struct ublk_queue *ubq = hctx->driver_data;
2162
struct ublk_batch_fetch_cmd *fcmd;
2163
2164
spin_lock(&ubq->evts_lock);
2165
fcmd = __ublk_acquire_fcmd(ubq);
2166
spin_unlock(&ubq->evts_lock);
2167
2168
if (fcmd)
2169
io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2170
}
2171
2172
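/*
 * Descriptive note (added): ->queue_rqs() for the non-batch path. Prepare
 * each request, group consecutive requests that share the same daemon task
 * and io_uring context, and hand every group to its daemon in one task-work
 * run. Requests that fail preparation are handed back via *rqlist.
 */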
static void ublk_queue_rqs(struct rq_list *rqlist)
2173
{
2174
struct rq_list requeue_list = { };
2175
struct rq_list submit_list = { };
2176
struct ublk_io *io = NULL;
2177
struct request *req;
2178
2179
while ((req = rq_list_pop(rqlist))) {
2180
struct ublk_queue *this_q = req->mq_hctx->driver_data;
2181
struct ublk_io *this_io = &this_q->ios[req->tag];
2182
2183
if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2184
rq_list_add_tail(&requeue_list, req);
2185
continue;
2186
}
2187
2188
if (io && !ublk_belong_to_same_batch(io, this_io) &&
2189
!rq_list_empty(&submit_list))
2190
ublk_queue_cmd_list(io, &submit_list);
2191
io = this_io;
2192
rq_list_add_tail(&submit_list, req);
2193
}
2194
2195
if (!rq_list_empty(&submit_list))
2196
ublk_queue_cmd_list(io, &submit_list);
2197
*rqlist = requeue_list;
2198
}
2199
2200
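/*
 * Descriptive note (added): push all tags of the submit list into the event
 * kfifo (in chunks of MAX_NR_TAG), then acquire a fetch command and kick
 * batch dispatch once for the whole list.
 */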
static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2201
{
2202
unsigned short tags[MAX_NR_TAG];
2203
struct ublk_batch_fetch_cmd *fcmd;
2204
struct request *rq;
2205
unsigned cnt = 0;
2206
2207
spin_lock(&ubq->evts_lock);
2208
rq_list_for_each(l, rq) {
2209
tags[cnt++] = (unsigned short)rq->tag;
2210
if (cnt >= MAX_NR_TAG) {
2211
kfifo_in(&ubq->evts_fifo, tags, cnt);
2212
cnt = 0;
2213
}
2214
}
2215
if (cnt)
2216
kfifo_in(&ubq->evts_fifo, tags, cnt);
2217
fcmd = __ublk_acquire_fcmd(ubq);
2218
spin_unlock(&ubq->evts_lock);
2219
2220
rq_list_init(l);
2221
if (fcmd)
2222
io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2223
}
2224
2225
static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2226
{
2227
struct rq_list requeue_list = { };
2228
struct rq_list submit_list = { };
2229
struct ublk_queue *ubq = NULL;
2230
struct request *req;
2231
2232
while ((req = rq_list_pop(rqlist))) {
2233
struct ublk_queue *this_q = req->mq_hctx->driver_data;
2234
2235
if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2236
rq_list_add_tail(&requeue_list, req);
2237
continue;
2238
}
2239
2240
if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2241
ublk_batch_queue_cmd_list(ubq, &submit_list);
2242
ubq = this_q;
2243
rq_list_add_tail(&submit_list, req);
2244
}
2245
2246
if (!rq_list_empty(&submit_list))
2247
ublk_batch_queue_cmd_list(ubq, &submit_list);
2248
*rqlist = requeue_list;
2249
}
2250
2251
static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2252
unsigned int hctx_idx)
2253
{
2254
struct ublk_device *ub = driver_data;
2255
struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2256
2257
hctx->driver_data = ubq;
2258
return 0;
2259
}
2260
2261
static const struct blk_mq_ops ublk_mq_ops = {
2262
.queue_rq = ublk_queue_rq,
2263
.queue_rqs = ublk_queue_rqs,
2264
.init_hctx = ublk_init_hctx,
2265
.timeout = ublk_timeout,
2266
};
2267
2268
static const struct blk_mq_ops ublk_batch_mq_ops = {
2269
.commit_rqs = ublk_commit_rqs,
2270
.queue_rq = ublk_batch_queue_rq,
2271
.queue_rqs = ublk_batch_queue_rqs,
2272
.init_hctx = ublk_init_hctx,
2273
.timeout = ublk_timeout,
2274
};
2275
2276
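/*
 * Descriptive note (added): reset per-queue io state after the ublk server
 * is gone so that a new daemon can re-attach (user recovery): clear stale
 * io->cmd pointers, put the old daemon task, and keep only
 * UBLK_IO_FLAG_CANCELED.
 */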
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2277
{
2278
int i;
2279
2280
ubq->nr_io_ready = 0;
2281
2282
for (i = 0; i < ubq->q_depth; i++) {
2283
struct ublk_io *io = &ubq->ios[i];
2284
2285
/*
2286
* UBLK_IO_FLAG_CANCELED is kept to avoid touching
2287
* io->cmd
2288
*/
2289
io->flags &= UBLK_IO_FLAG_CANCELED;
2290
io->cmd = NULL;
2291
io->buf.addr = 0;
2292
2293
/*
2294
* old task is PF_EXITING, put it now
2295
*
2296
* It could be NULL in case of closing one quiesced
2297
* device.
2298
*/
2299
if (io->task) {
2300
put_task_struct(io->task);
2301
io->task = NULL;
2302
}
2303
2304
WARN_ON_ONCE(refcount_read(&io->ref));
2305
WARN_ON_ONCE(io->task_registered_buffers);
2306
}
2307
}
2308
2309
static int ublk_ch_open(struct inode *inode, struct file *filp)
2310
{
2311
struct ublk_device *ub = container_of(inode->i_cdev,
2312
struct ublk_device, cdev);
2313
2314
if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2315
return -EBUSY;
2316
filp->private_data = ub;
2317
ub->ublksrv_tgid = current->tgid;
2318
return 0;
2319
}
2320
2321
static void ublk_reset_ch_dev(struct ublk_device *ub)
2322
{
2323
int i;
2324
2325
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2326
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2327
2328
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2329
ub->mm = NULL;
2330
ub->nr_queue_ready = 0;
2331
ub->unprivileged_daemons = false;
2332
ub->ublksrv_tgid = -1;
2333
}
2334
2335
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2336
{
2337
struct gendisk *disk;
2338
2339
spin_lock(&ub->lock);
2340
disk = ub->ub_disk;
2341
if (disk)
2342
get_device(disk_to_dev(disk));
2343
spin_unlock(&ub->lock);
2344
2345
return disk;
2346
}
2347
2348
static void ublk_put_disk(struct gendisk *disk)
2349
{
2350
if (disk)
2351
put_device(disk_to_dev(disk));
2352
}
2353
2354
static void ublk_partition_scan_work(struct work_struct *work)
2355
{
2356
struct ublk_device *ub =
2357
container_of(work, struct ublk_device, partition_scan_work);
2358
/* Hold disk reference to prevent UAF during concurrent teardown */
2359
struct gendisk *disk = ublk_get_disk(ub);
2360
2361
if (!disk)
2362
return;
2363
2364
if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2365
&disk->state)))
2366
goto out;
2367
2368
mutex_lock(&disk->open_mutex);
2369
bdev_disk_changed(disk, false);
2370
mutex_unlock(&disk->open_mutex);
2371
out:
2372
ublk_put_disk(disk);
2373
}
2374
2375
/*
2376
* Use this function to ensure that ->canceling is consistently set for
2377
* the device and all queues. Do not set these flags directly.
2378
*
2379
* Caller must ensure that:
2380
* - cancel_mutex is held. This ensures that there is no concurrent
2381
* access to ub->canceling and no concurrent writes to ubq->canceling.
2382
* - there are no concurrent reads of ubq->canceling from the queue_rq
2383
* path. This can be done by quiescing the queue, or through other
2384
* means.
2385
*/
2386
static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2387
__must_hold(&ub->cancel_mutex)
2388
{
2389
int i;
2390
2391
ub->canceling = canceling;
2392
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2393
ublk_get_queue(ub, i)->canceling = canceling;
2394
}
2395
2396
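/*
 * Descriptive note (added): returns true if any io still holds an active
 * reference (e.g. a zero-copy buffer is still registered), in which case the
 * release work must be retried later; otherwise all reference state is reset
 * to zero. A no-op for devices that don't use per-io references.
 */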
static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2397
{
2398
int i, j;
2399
2400
if (!ublk_dev_need_req_ref(ub))
2401
return false;
2402
2403
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2404
struct ublk_queue *ubq = ublk_get_queue(ub, i);
2405
2406
for (j = 0; j < ubq->q_depth; j++) {
2407
struct ublk_io *io = &ubq->ios[j];
2408
unsigned int refs = refcount_read(&io->ref) +
2409
io->task_registered_buffers;
2410
2411
/*
2412
* UBLK_REFCOUNT_INIT or zero means no active
2413
* reference
2414
*/
2415
if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2416
return true;
2417
2418
/* reset to zero if the io has no active references */
2419
refcount_set(&io->ref, 0);
2420
io->task_registered_buffers = 0;
2421
}
2422
}
2423
return false;
2424
}
2425
2426
static void ublk_ch_release_work_fn(struct work_struct *work)
2427
{
2428
struct ublk_device *ub =
2429
container_of(work, struct ublk_device, exit_work.work);
2430
struct gendisk *disk;
2431
int i;
2432
2433
/*
2434
* For zero-copy and auto buffer register modes, I/O references
2435
* might not be dropped naturally when the daemon is killed, but
2436
* io_uring guarantees that registered bvec kernel buffers are
2437
* unregistered finally when freeing io_uring context, then the
2438
* active references are dropped.
2439
*
2440
* Wait until active references are dropped, to avoid use-after-free.
2441
*
2442
* A registered buffer may be unregistered in io_uring's release handler,
2443
* so we have to wait via a scheduled work function to avoid a release
2444
* dependency between the two files.
2445
*/
2446
if (ublk_check_and_reset_active_ref(ub)) {
2447
schedule_delayed_work(&ub->exit_work, 1);
2448
return;
2449
}
2450
2451
/*
2452
* The disk isn't attached yet: either the device isn't live, or it has
2453
* been removed already, so we needn't do anything
2454
*/
2455
disk = ublk_get_disk(ub);
2456
if (!disk)
2457
goto out;
2458
2459
/*
2460
* All uring_cmd are done now, so abort any request outstanding to
2461
* the ublk server
2462
*
2463
* This can be done in a lockless way because the ublk server is
2464
* gone
2465
*
2466
* More importantly, we have to provide forward progress guarantee
2467
* without holding ub->mutex, otherwise control task grabbing
2468
* ub->mutex triggers deadlock
2469
*
2470
* All requests may be inflight, so ->canceling may not be set, set
2471
* it now.
2472
*/
2473
mutex_lock(&ub->cancel_mutex);
2474
ublk_set_canceling(ub, true);
2475
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2476
ublk_abort_queue(ub, ublk_get_queue(ub, i));
2477
mutex_unlock(&ub->cancel_mutex);
2478
blk_mq_kick_requeue_list(disk->queue);
2479
2480
/*
2481
* All inflight requests have been completed or requeued, and any new
2482
* request will be failed or requeued via `->canceling` now, so it is
2483
* fine to grab ub->mutex now.
2484
*/
2485
mutex_lock(&ub->mutex);
2486
2487
/* double check after grabbing lock */
2488
if (!ub->ub_disk)
2489
goto unlock;
2490
2491
/*
2492
* Transition the device to the nosrv state. What exactly this
2493
* means depends on the recovery flags
2494
*/
2495
if (ublk_nosrv_should_stop_dev(ub)) {
2496
/*
2497
* Allow any pending/future I/O to pass through quickly
2498
* with an error. This is needed because del_gendisk
2499
* waits for all pending I/O to complete
2500
*/
2501
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2502
WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2503
2504
ublk_stop_dev_unlocked(ub);
2505
} else {
2506
if (ublk_nosrv_dev_should_queue_io(ub)) {
2507
/* ->canceling is set and all requests are aborted */
2508
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2509
} else {
2510
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2511
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2512
WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2513
}
2514
}
2515
unlock:
2516
mutex_unlock(&ub->mutex);
2517
ublk_put_disk(disk);
2518
2519
/* all uring_cmd has been done now, reset device & ubq */
2520
ublk_reset_ch_dev(ub);
2521
out:
2522
clear_bit(UB_STATE_OPEN, &ub->state);
2523
2524
/* put the reference grabbed in ublk_ch_release() */
2525
ublk_put_device(ub);
2526
}
2527
2528
static int ublk_ch_release(struct inode *inode, struct file *filp)
2529
{
2530
struct ublk_device *ub = filp->private_data;
2531
2532
/*
2533
* Grab ublk device reference, so it won't be gone until we are
2534
* really released from work function.
2535
*/
2536
ublk_get_device(ub);
2537
2538
INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2539
schedule_delayed_work(&ub->exit_work, 0);
2540
return 0;
2541
}
2542
2543
/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2544
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2545
{
2546
struct ublk_device *ub = filp->private_data;
2547
size_t sz = vma->vm_end - vma->vm_start;
2548
unsigned max_sz = ublk_max_cmd_buf_size();
2549
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2550
int q_id, ret = 0;
2551
2552
spin_lock(&ub->lock);
2553
if (!ub->mm)
2554
ub->mm = current->mm;
2555
if (current->mm != ub->mm)
2556
ret = -EINVAL;
2557
spin_unlock(&ub->lock);
2558
2559
if (ret)
2560
return ret;
2561
2562
if (vma->vm_flags & VM_WRITE)
2563
return -EPERM;
2564
2565
end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2566
if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2567
return -EINVAL;
2568
2569
q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2570
pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2571
__func__, q_id, current->pid, vma->vm_start,
2572
phys_off, (unsigned long)sz);
2573
2574
if (sz != ublk_queue_cmd_buf_size(ub))
2575
return -EINVAL;
2576
2577
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2578
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2579
}
2580
2581
static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2582
struct request *req)
2583
{
2584
WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2585
io->flags & UBLK_IO_FLAG_ACTIVE);
2586
2587
if (ublk_nosrv_should_reissue_outstanding(ub))
2588
blk_mq_requeue_request(req, false);
2589
else {
2590
io->res = -EIO;
2591
__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2592
}
2593
}
2594
2595
/*
2596
* A request tag may have just been added to the event kfifo without getting
2597
* a chance to be dispatched; abort these requests too
2598
*/
2599
static void ublk_abort_batch_queue(struct ublk_device *ub,
2600
struct ublk_queue *ubq)
2601
{
2602
unsigned short tag;
2603
2604
while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2605
struct request *req = blk_mq_tag_to_rq(
2606
ub->tag_set.tags[ubq->q_id], tag);
2607
2608
if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2609
__ublk_fail_req(ub, &ubq->ios[tag], req);
2610
}
2611
}
2612
2613
/*
2614
* Called from ublk char device release handler, when any uring_cmd is
2615
* done; meanwhile the request queue is "quiesced" since all inflight requests
2616
* can't be completed because ublk server is dead.
2617
*
2618
* So no one can hold our request IO reference any more, simply ignore the
2619
* reference, and complete the request immediately
2620
*/
2621
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2622
{
2623
int i;
2624
2625
for (i = 0; i < ubq->q_depth; i++) {
2626
struct ublk_io *io = &ubq->ios[i];
2627
2628
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2629
__ublk_fail_req(ub, io, io->req);
2630
}
2631
2632
if (ublk_support_batch_io(ubq))
2633
ublk_abort_batch_queue(ub, ubq);
2634
}
2635
2636
static void ublk_start_cancel(struct ublk_device *ub)
2637
{
2638
struct gendisk *disk = ublk_get_disk(ub);
2639
2640
/* Our disk is already gone */
2641
if (!disk)
2642
return;
2643
2644
mutex_lock(&ub->cancel_mutex);
2645
if (ub->canceling)
2646
goto out;
2647
/*
2648
* Now we are serialized with ublk_queue_rq()
2649
*
2650
* Make sure that ubq->canceling is set when queue is frozen,
2651
* because ublk_queue_rq() has to rely on this flag so that it doesn't
2652
* touch a completed uring_cmd
2653
*/
2654
blk_mq_quiesce_queue(disk->queue);
2655
ublk_set_canceling(ub, true);
2656
blk_mq_unquiesce_queue(disk->queue);
2657
out:
2658
mutex_unlock(&ub->cancel_mutex);
2659
ublk_put_disk(disk);
2660
}
2661
2662
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2663
unsigned int issue_flags)
2664
{
2665
struct ublk_io *io = &ubq->ios[tag];
2666
struct ublk_device *ub = ubq->dev;
2667
struct request *req;
2668
bool done;
2669
2670
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2671
return;
2672
2673
/*
2674
* Don't try to cancel this command if the request is started for
2675
* avoiding race between io_uring_cmd_done() and
2676
* io_uring_cmd_complete_in_task().
2677
*
2678
* Either the started request will be aborted via __ublk_abort_rq(),
2679
* then this uring_cmd is canceled next time, or it will be done in
2680
* task work function ublk_dispatch_req() because io_uring guarantees
2681
* that ublk_dispatch_req() is always called
2682
*/
2683
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2684
if (req && blk_mq_request_started(req) && req->tag == tag)
2685
return;
2686
2687
spin_lock(&ubq->cancel_lock);
2688
done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2689
if (!done)
2690
io->flags |= UBLK_IO_FLAG_CANCELED;
2691
spin_unlock(&ubq->cancel_lock);
2692
2693
if (!done)
2694
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2695
}
2696
2697
/*
2698
* Cancel a batch fetch command if it hasn't been claimed by another path.
2699
*
2700
* An fcmd can only be cancelled if:
2701
* 1. It's not the active_fcmd (which is currently being processed)
2702
* 2. It's still on the list (!list_empty check) - once removed from the list,
2703
* the fcmd is considered claimed and will be freed by whoever removed it
2704
*
2705
* Use list_del_init() so subsequent list_empty() checks work correctly.
2706
*/
2707
static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2708
struct ublk_batch_fetch_cmd *fcmd,
2709
unsigned int issue_flags)
2710
{
2711
bool done;
2712
2713
spin_lock(&ubq->evts_lock);
2714
done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2715
if (done)
2716
list_del_init(&fcmd->node);
2717
spin_unlock(&ubq->evts_lock);
2718
2719
if (done) {
2720
io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2721
ublk_batch_free_fcmd(fcmd);
2722
}
2723
}
2724
2725
static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2726
{
2727
struct ublk_batch_fetch_cmd *fcmd;
2728
LIST_HEAD(fcmd_list);
2729
2730
spin_lock(&ubq->evts_lock);
2731
ubq->force_abort = true;
2732
list_splice_init(&ubq->fcmd_head, &fcmd_list);
2733
fcmd = READ_ONCE(ubq->active_fcmd);
2734
if (fcmd)
2735
list_move(&fcmd->node, &ubq->fcmd_head);
2736
spin_unlock(&ubq->evts_lock);
2737
2738
while (!list_empty(&fcmd_list)) {
2739
fcmd = list_first_entry(&fcmd_list,
2740
struct ublk_batch_fetch_cmd, node);
2741
ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2742
}
2743
}
2744
2745
static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2746
unsigned int issue_flags)
2747
{
2748
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2749
struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2750
struct ublk_queue *ubq = pdu->ubq;
2751
2752
ublk_start_cancel(ubq->dev);
2753
2754
ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2755
}
2756
2757
/*
2758
* The ublk char device won't be closed when calling cancel fn, so both
2759
* ublk device and queue are guaranteed to be live
2760
*
2761
* Two-stage cancel:
2762
*
2763
* - make every active uring_cmd done in ->cancel_fn()
2764
*
2765
* - aborting inflight ublk IO requests in ublk char device release handler,
2766
* which depends on the 1st stage because the device can only be closed after all
2767
* uring_cmd are done
2768
*
2769
* Do _not_ try to acquire ub->mutex before all inflight requests are
2770
* aborted, otherwise deadlock may be caused.
2771
*/
2772
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2773
unsigned int issue_flags)
2774
{
2775
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2776
struct ublk_queue *ubq = pdu->ubq;
2777
struct task_struct *task;
2778
struct ublk_io *io;
2779
2780
if (WARN_ON_ONCE(!ubq))
2781
return;
2782
2783
if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2784
return;
2785
2786
task = io_uring_cmd_get_task(cmd);
2787
io = &ubq->ios[pdu->tag];
2788
if (WARN_ON_ONCE(task && task != io->task))
2789
return;
2790
2791
ublk_start_cancel(ubq->dev);
2792
2793
WARN_ON_ONCE(io->cmd != cmd);
2794
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2795
}
2796
2797
static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2798
{
2799
return ubq->nr_io_ready == ubq->q_depth;
2800
}
2801
2802
static inline bool ublk_dev_ready(const struct ublk_device *ub)
2803
{
2804
return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2805
}
2806
2807
static void ublk_cancel_queue(struct ublk_queue *ubq)
2808
{
2809
int i;
2810
2811
if (ublk_support_batch_io(ubq)) {
2812
ublk_batch_cancel_queue(ubq);
2813
return;
2814
}
2815
2816
for (i = 0; i < ubq->q_depth; i++)
2817
ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2818
}
2819
2820
/* Cancel all pending commands, must be called after del_gendisk() returns */
2821
static void ublk_cancel_dev(struct ublk_device *ub)
2822
{
2823
int i;
2824
2825
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2826
ublk_cancel_queue(ublk_get_queue(ub, i));
2827
}
2828
2829
static bool ublk_check_inflight_rq(struct request *rq, void *data)
2830
{
2831
bool *idle = data;
2832
2833
if (blk_mq_request_started(rq)) {
2834
*idle = false;
2835
return false;
2836
}
2837
return true;
2838
}
2839
2840
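/*
 * Descriptive note (added): poll (with small sleeps) until no started
 * request remains in the tagset; the request queue must already be quiesced
 * by the caller.
 */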
static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2841
{
2842
bool idle;
2843
2844
WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2845
while (true) {
2846
idle = true;
2847
blk_mq_tagset_busy_iter(&ub->tag_set,
2848
ublk_check_inflight_rq, &idle);
2849
if (idle)
2850
break;
2851
msleep(UBLK_REQUEUE_DELAY_MS);
2852
}
2853
}
2854
2855
static void ublk_force_abort_dev(struct ublk_device *ub)
2856
{
2857
int i;
2858
2859
pr_devel("%s: force abort ub: dev_id %d state %s\n",
2860
__func__, ub->dev_info.dev_id,
2861
ub->dev_info.state == UBLK_S_DEV_LIVE ?
2862
"LIVE" : "QUIESCED");
2863
blk_mq_quiesce_queue(ub->ub_disk->queue);
2864
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2865
ublk_wait_tagset_rqs_idle(ub);
2866
2867
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2868
ublk_get_queue(ub, i)->force_abort = true;
2869
blk_mq_unquiesce_queue(ub->ub_disk->queue);
2870
/* We may have requeued some rqs in ublk_quiesce_queue() */
2871
blk_mq_kick_requeue_list(ub->ub_disk->queue);
2872
}
2873
2874
static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2875
{
2876
struct gendisk *disk;
2877
2878
/* Sync with ublk_abort_queue() by holding the lock */
2879
spin_lock(&ub->lock);
2880
disk = ub->ub_disk;
2881
ub->dev_info.state = UBLK_S_DEV_DEAD;
2882
ub->dev_info.ublksrv_pid = -1;
2883
ub->ub_disk = NULL;
2884
spin_unlock(&ub->lock);
2885
2886
return disk;
2887
}
2888
2889
static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2890
__must_hold(&ub->mutex)
2891
{
2892
struct gendisk *disk;
2893
2894
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2895
return;
2896
2897
if (ublk_nosrv_dev_should_queue_io(ub))
2898
ublk_force_abort_dev(ub);
2899
del_gendisk(ub->ub_disk);
2900
disk = ublk_detach_disk(ub);
2901
put_disk(disk);
2902
}
2903
2904
static void ublk_stop_dev(struct ublk_device *ub)
2905
{
2906
mutex_lock(&ub->mutex);
2907
ublk_stop_dev_unlocked(ub);
2908
mutex_unlock(&ub->mutex);
2909
cancel_work_sync(&ub->partition_scan_work);
2910
ublk_cancel_dev(ub);
2911
}
2912
2913
/* reset per-queue io flags */
2914
static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2915
{
2916
int j;
2917
2918
/* UBLK_IO_FLAG_CANCELED can be cleared now */
2919
spin_lock(&ubq->cancel_lock);
2920
for (j = 0; j < ubq->q_depth; j++)
2921
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2922
ubq->canceling = false;
2923
spin_unlock(&ubq->cancel_lock);
2924
ubq->fail_io = false;
2925
}
2926
2927
/* device can only be started after all IOs are ready */
2928
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
2929
__must_hold(&ub->mutex)
2930
{
2931
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2932
2933
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2934
ub->unprivileged_daemons = true;
2935
2936
ubq->nr_io_ready++;
2937
2938
/* Check if this specific queue is now fully ready */
2939
if (ublk_queue_ready(ubq)) {
2940
ub->nr_queue_ready++;
2941
2942
/*
2943
* Reset queue flags as soon as this queue is ready.
2944
* This clears the canceling flag, allowing batch FETCH commands
2945
* to succeed during recovery without waiting for all queues.
2946
*/
2947
ublk_queue_reset_io_flags(ubq);
2948
}
2949
2950
/* Check if all queues are ready */
2951
if (ublk_dev_ready(ub)) {
2952
/*
2953
* All queues ready - clear device-level canceling flag
2954
* and complete the recovery/initialization.
2955
*/
2956
mutex_lock(&ub->cancel_mutex);
2957
ub->canceling = false;
2958
mutex_unlock(&ub->cancel_mutex);
2959
complete_all(&ub->completion);
2960
}
2961
}
2962
2963
static inline int ublk_check_cmd_op(u32 cmd_op)
2964
{
2965
u32 ioc_type = _IOC_TYPE(cmd_op);
2966
2967
if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2968
return -EOPNOTSUPP;
2969
2970
if (ioc_type != 'u' && ioc_type != 0)
2971
return -EOPNOTSUPP;
2972
2973
return 0;
2974
}
2975
2976
static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2977
{
2978
struct ublk_auto_buf_reg buf;
2979
2980
buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2981
2982
if (buf.reserved0 || buf.reserved1)
2983
return -EINVAL;
2984
2985
if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2986
return -EINVAL;
2987
io->buf.auto_reg = buf;
2988
return 0;
2989
}
2990
2991
static void ublk_clear_auto_buf_reg(struct ublk_io *io,
2992
struct io_uring_cmd *cmd,
2993
u16 *buf_idx)
2994
{
2995
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2996
io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2997
2998
/*
2999
* `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3000
* and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
3001
* `io_ring_ctx`.
3002
*
3003
* If this uring_cmd's io_ring_ctx isn't the same as the
3004
* one used for registering the buffer, it is the ublk server's
3005
* responsibility for unregistering the buffer, otherwise
3006
* this ublk request gets stuck.
3007
*/
3008
if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3009
*buf_idx = io->buf.auto_reg.index;
3010
}
3011
}
3012
3013
static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3014
struct io_uring_cmd *cmd,
3015
u16 *buf_idx)
3016
{
3017
ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3018
return ublk_set_auto_buf_reg(io, cmd);
3019
}
3020
3021
/* Once we return, `io->req` can't be used any more */
3022
static inline struct request *
3023
ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3024
{
3025
struct request *req = io->req;
3026
3027
io->cmd = cmd;
3028
io->flags |= UBLK_IO_FLAG_ACTIVE;
3029
/* now this cmd slot is owned by ublk driver */
3030
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3031
3032
return req;
3033
}
3034
3035
static inline int
3036
ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3037
struct io_uring_cmd *cmd, unsigned long buf_addr,
3038
u16 *buf_idx)
3039
{
3040
if (ublk_dev_support_auto_buf_reg(ub))
3041
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3042
3043
io->buf.addr = buf_addr;
3044
return 0;
3045
}
3046
3047
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3048
unsigned int issue_flags,
3049
struct ublk_queue *ubq, unsigned int tag)
3050
{
3051
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3052
3053
/*
3054
* Safe to refer to @ubq since the ublk_queue won't go away until its
3055
* commands are completed
3056
*/
3057
pdu->ubq = ubq;
3058
pdu->tag = tag;
3059
io_uring_cmd_mark_cancelable(cmd, issue_flags);
3060
}
3061
3062
static void ublk_io_release(void *priv)
3063
{
3064
struct request *rq = priv;
3065
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3066
struct ublk_io *io = &ubq->ios[rq->tag];
3067
3068
/*
3069
* task_registered_buffers may be 0 if buffers were registered off task
3070
* but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3071
*/
3072
if (current == io->task && io->task_registered_buffers)
3073
io->task_registered_buffers--;
3074
else
3075
ublk_put_req_ref(io, rq);
3076
}
3077
3078
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3079
struct ublk_device *ub,
3080
u16 q_id, u16 tag,
3081
struct ublk_io *io,
3082
unsigned int index, unsigned int issue_flags)
3083
{
3084
struct request *req;
3085
int ret;
3086
3087
if (!ublk_dev_support_zero_copy(ub))
3088
return -EINVAL;
3089
3090
req = __ublk_check_and_get_req(ub, q_id, tag, io);
3091
if (!req)
3092
return -EINVAL;
3093
3094
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3095
issue_flags);
3096
if (ret) {
3097
ublk_put_req_ref(io, req);
3098
return ret;
3099
}
3100
3101
return 0;
3102
}
3103
3104
static int
3105
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3106
struct ublk_device *ub,
3107
u16 q_id, u16 tag, struct ublk_io *io,
3108
unsigned index, unsigned issue_flags)
3109
{
3110
unsigned new_registered_buffers;
3111
struct request *req = io->req;
3112
int ret;
3113
3114
/*
3115
* Ensure there are still references for ublk_sub_req_ref() to release.
3116
* If not, fall back on the thread-safe buffer registration.
3117
*/
3118
new_registered_buffers = io->task_registered_buffers + 1;
3119
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3120
return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3121
issue_flags);
3122
3123
if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3124
return -EINVAL;
3125
3126
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3127
issue_flags);
3128
if (ret)
3129
return ret;
3130
3131
io->task_registered_buffers = new_registered_buffers;
3132
return 0;
3133
}
3134
3135
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3136
const struct ublk_device *ub,
3137
unsigned int index, unsigned int issue_flags)
3138
{
3139
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3140
return -EINVAL;
3141
3142
return io_buffer_unregister_bvec(cmd, index, issue_flags);
3143
}
3144
3145
static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3146
{
3147
if (ublk_dev_need_map_io(ub)) {
3148
/*
3149
* FETCH_RQ has to provide IO buffer if NEED GET
3150
* DATA is not enabled
3151
*/
3152
if (!buf_addr && !ublk_dev_need_get_data(ub))
3153
return -EINVAL;
3154
} else if (buf_addr) {
3155
/* User copy requires addr to be unset */
3156
return -EINVAL;
3157
}
3158
return 0;
3159
}
3160
3161
static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3162
struct ublk_io *io, u16 q_id)
3163
{
3164
/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3165
if (ublk_dev_ready(ub))
3166
return -EBUSY;
3167
3168
/* allow each command to be FETCHed at most once */
3169
if (io->flags & UBLK_IO_FLAG_ACTIVE)
3170
return -EINVAL;
3171
3172
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3173
3174
ublk_fill_io_cmd(io, cmd);
3175
3176
if (ublk_dev_support_batch_io(ub))
3177
WRITE_ONCE(io->task, NULL);
3178
else
3179
WRITE_ONCE(io->task, get_task_struct(current));
3180
3181
return 0;
3182
}
3183
3184
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3185
struct ublk_io *io, __u64 buf_addr, u16 q_id)
3186
{
3187
int ret;
3188
3189
/*
3190
* When handling FETCH command for setting up ublk uring queue,
3191
* ub->mutex is the innermost lock, and we won't block for handling
3192
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3193
*/
3194
mutex_lock(&ub->mutex);
3195
ret = __ublk_fetch(cmd, ub, io, q_id);
3196
if (!ret)
3197
ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3198
if (!ret)
3199
ublk_mark_io_ready(ub, q_id);
3200
mutex_unlock(&ub->mutex);
3201
return ret;
3202
}
3203
3204
static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3205
struct ublk_io *io, __u64 buf_addr)
3206
{
3207
struct request *req = io->req;
3208
3209
if (ublk_dev_need_map_io(ub)) {
3210
/*
3211
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
3212
* NEED GET DATA is not enabled or it is Read IO.
3213
*/
3214
if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3215
req_op(req) == REQ_OP_READ))
3216
return -EINVAL;
3217
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3218
/*
3219
* User copy requires addr to be unset when command is
3220
* not zone append
3221
*/
3222
return -EINVAL;
3223
}
3224
3225
return 0;
3226
}
3227
3228
static bool ublk_need_complete_req(const struct ublk_device *ub,
3229
struct ublk_io *io)
3230
{
3231
if (ublk_dev_need_req_ref(ub))
3232
return ublk_sub_req_ref(io);
3233
return true;
3234
}
3235
3236
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3237
struct request *req)
3238
{
3239
/*
3240
* We have handled UBLK_IO_NEED_GET_DATA command,
3241
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3242
* do the copy work.
3243
*/
3244
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3245
/* update iod->addr because ublksrv may have passed a new io buffer */
3246
ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3247
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3248
__func__, ubq->q_id, req->tag, io->flags,
3249
ublk_get_iod(ubq, req->tag)->addr);
3250
3251
return ublk_start_io(ubq, req, io);
3252
}
3253
3254
static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3255
unsigned int issue_flags)
3256
{
3257
/* May point to userspace-mapped memory */
3258
const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
3259
u16 buf_idx = UBLK_INVALID_BUF_IDX;
3260
struct ublk_device *ub = cmd->file->private_data;
3261
struct ublk_queue *ubq;
3262
struct ublk_io *io = NULL;
3263
u32 cmd_op = cmd->cmd_op;
3264
u16 q_id = READ_ONCE(ub_src->q_id);
3265
u16 tag = READ_ONCE(ub_src->tag);
3266
s32 result = READ_ONCE(ub_src->result);
3267
u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3268
struct request *req;
3269
int ret;
3270
bool compl;
3271
3272
WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3273
3274
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3275
__func__, cmd->cmd_op, q_id, tag, result);
3276
3277
ret = ublk_check_cmd_op(cmd_op);
3278
if (ret)
3279
goto out;
3280
3281
/*
3282
* io_buffer_unregister_bvec() doesn't access the ubq or io,
3283
* so no need to validate the q_id, tag, or task
3284
*/
3285
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3286
return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3287
3288
ret = -EINVAL;
3289
if (q_id >= ub->dev_info.nr_hw_queues)
3290
goto out;
3291
3292
ubq = ublk_get_queue(ub, q_id);
3293
3294
if (tag >= ub->dev_info.queue_depth)
3295
goto out;
3296
3297
io = &ubq->ios[tag];
3298
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3299
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3300
ret = ublk_check_fetch_buf(ub, addr);
3301
if (ret)
3302
goto out;
3303
ret = ublk_fetch(cmd, ub, io, addr, q_id);
3304
if (ret)
3305
goto out;
3306
3307
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3308
return -EIOCBQUEUED;
3309
}
3310
3311
if (READ_ONCE(io->task) != current) {
3312
/*
3313
* ublk_register_io_buf() accesses only the io's refcount,
3314
* so it can be handled on any task
3315
*/
3316
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3317
return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3318
addr, issue_flags);
3319
3320
goto out;
3321
}
3322
3323
/* there is pending io cmd, something must be wrong */
3324
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3325
ret = -EBUSY;
3326
goto out;
3327
}
3328
3329
/*
3330
* ensure that the user issues UBLK_IO_NEED_GET_DATA
3331
* iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
3332
*/
3333
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3334
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3335
goto out;
3336
3337
switch (_IOC_NR(cmd_op)) {
3338
case UBLK_IO_REGISTER_IO_BUF:
3339
return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3340
issue_flags);
3341
case UBLK_IO_COMMIT_AND_FETCH_REQ:
3342
ret = ublk_check_commit_and_fetch(ub, io, addr);
3343
if (ret)
3344
goto out;
3345
io->res = result;
3346
req = ublk_fill_io_cmd(io, cmd);
3347
ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3348
if (buf_idx != UBLK_INVALID_BUF_IDX)
3349
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3350
compl = ublk_need_complete_req(ub, io);
3351
3352
if (req_op(req) == REQ_OP_ZONE_APPEND)
3353
req->__sector = addr;
3354
if (compl)
3355
__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3356
3357
if (ret)
3358
goto out;
3359
break;
3360
case UBLK_IO_NEED_GET_DATA:
3361
/*
3362
* ublk_get_data() may fail and fall back to requeue, so keep
3363
* uring_cmd active first and prepare for handling new requeued
3364
* request
3365
*/
3366
req = ublk_fill_io_cmd(io, cmd);
3367
ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3368
WARN_ON_ONCE(ret);
3369
if (likely(ublk_get_data(ubq, io, req))) {
3370
__ublk_prep_compl_io_cmd(io, req);
3371
return UBLK_IO_RES_OK;
3372
}
3373
break;
3374
default:
3375
goto out;
3376
}
3377
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3378
return -EIOCBQUEUED;
3379
3380
out:
3381
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3382
__func__, cmd_op, tag, ret, io ? io->flags : 0);
3383
return ret;
3384
}
3385
3386
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3387
u16 q_id, u16 tag, struct ublk_io *io)
3388
{
3389
struct request *req;
3390
3391
/*
3392
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3393
* which would overwrite it with io->cmd
3394
*/
3395
req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3396
if (!req)
3397
return NULL;
3398
3399
if (!ublk_get_req_ref(io))
3400
return NULL;
3401
3402
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3403
goto fail_put;
3404
3405
if (!ublk_rq_has_data(req))
3406
goto fail_put;
3407
3408
return req;
3409
fail_put:
3410
ublk_put_req_ref(io, req);
3411
return NULL;
3412
}
3413
3414
static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3415
{
3416
unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3417
struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3418
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3419
3420
if (ret != -EIOCBQUEUED)
3421
io_uring_cmd_done(cmd, ret, issue_flags);
3422
}
3423
3424
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3425
{
3426
if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3427
ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3428
return 0;
3429
}
3430
3431
/* a well-implemented server won't run into the unlocked path */
3432
if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3433
io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3434
return -EIOCBQUEUED;
3435
}
3436
3437
return ublk_ch_uring_cmd_local(cmd, issue_flags);
3438
}
3439
3440
static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3441
const struct ublk_elem_header *elem)
3442
{
3443
const void *buf = elem;
3444
3445
if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3446
return *(const __u64 *)(buf + sizeof(*elem));
3447
return 0;
3448
}
3449
3450
static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3451
const struct ublk_elem_header *elem)
3452
{
3453
const void *buf = elem;
3454
3455
if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3456
return *(const __u64 *)(buf + sizeof(*elem) +
3457
8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3458
return -1;
3459
}
3460
3461
static struct ublk_auto_buf_reg
3462
ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3463
const struct ublk_elem_header *elem)
3464
{
3465
struct ublk_auto_buf_reg reg = {
3466
.index = elem->buf_index,
3467
.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3468
UBLK_AUTO_BUF_REG_FALLBACK : 0,
3469
};
3470
3471
return reg;
3472
}
3473
3474
/*
3475
* 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3476
* it is the least common multiple (LCM) of 8, 16 and 24
3477
*/
3478
#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
3479
struct ublk_batch_io_iter {
3480
void __user *uaddr;
3481
unsigned done, total;
3482
unsigned char elem_bytes;
3483
/* copy to this buffer from user space */
3484
unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3485
};
3486
3487
static inline int
3488
__ublk_walk_cmd_buf(struct ublk_queue *ubq,
3489
struct ublk_batch_io_iter *iter,
3490
const struct ublk_batch_io_data *data,
3491
unsigned bytes,
3492
int (*cb)(struct ublk_queue *q,
3493
const struct ublk_batch_io_data *data,
3494
const struct ublk_elem_header *elem))
3495
{
3496
unsigned int i;
3497
int ret = 0;
3498
3499
for (i = 0; i < bytes; i += iter->elem_bytes) {
3500
const struct ublk_elem_header *elem =
3501
(const struct ublk_elem_header *)&iter->buf[i];
3502
3503
if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3504
ret = -EINVAL;
3505
break;
3506
}
3507
3508
ret = cb(ubq, data, elem);
3509
if (unlikely(ret))
3510
break;
3511
}
3512
3513
iter->done += i;
3514
return ret;
3515
}
3516
3517
static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3518
const struct ublk_batch_io_data *data,
3519
int (*cb)(struct ublk_queue *q,
3520
const struct ublk_batch_io_data *data,
3521
const struct ublk_elem_header *elem))
3522
{
3523
struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3524
int ret = 0;
3525
3526
while (iter->done < iter->total) {
3527
unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3528
3529
if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3530
pr_warn("ublk%d: read batch cmd buffer failed\n",
3531
data->ub->dev_info.dev_id);
3532
return -EFAULT;
3533
}
3534
3535
ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3536
if (ret)
3537
return ret;
3538
}
3539
return 0;
3540
}
3541
3542
static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3543
const struct ublk_batch_io_data *data,
3544
const struct ublk_elem_header *elem)
3545
{
3546
struct ublk_io *io = &ubq->ios[elem->tag];
3547
3548
/*
3549
* If queue was ready before this decrement, it won't be anymore,
3550
* so we need to decrement the queue ready count and restore the
3551
* canceling flag to prevent new requests from being queued.
3552
*/
3553
if (ublk_queue_ready(ubq)) {
3554
data->ub->nr_queue_ready--;
3555
spin_lock(&ubq->cancel_lock);
3556
ubq->canceling = true;
3557
spin_unlock(&ubq->cancel_lock);
3558
}
3559
ubq->nr_io_ready--;
3560
3561
ublk_io_lock(io);
3562
io->flags = 0;
3563
ublk_io_unlock(io);
3564
return 0;
3565
}
3566
3567
static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3568
const struct ublk_batch_io_data *data)
3569
{
3570
int ret;
3571
3572
/* Re-process only what we've already processed, starting from beginning */
3573
iter->total = iter->done;
3574
iter->done = 0;
3575
3576
ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3577
WARN_ON_ONCE(ret);
3578
}
3579
3580
static int ublk_batch_prep_io(struct ublk_queue *ubq,
3581
const struct ublk_batch_io_data *data,
3582
const struct ublk_elem_header *elem)
3583
{
3584
struct ublk_io *io = &ubq->ios[elem->tag];
3585
const struct ublk_batch_io *uc = &data->header;
3586
union ublk_io_buf buf = { 0 };
3587
int ret;
3588
3589
if (ublk_dev_support_auto_buf_reg(data->ub))
3590
buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3591
else if (ublk_dev_need_map_io(data->ub)) {
3592
buf.addr = ublk_batch_buf_addr(uc, elem);
3593
3594
ret = ublk_check_fetch_buf(data->ub, buf.addr);
3595
if (ret)
3596
return ret;
3597
}
3598
3599
ublk_io_lock(io);
3600
ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3601
if (!ret)
3602
io->buf = buf;
3603
ublk_io_unlock(io);
3604
3605
if (!ret)
3606
ublk_mark_io_ready(data->ub, ubq->q_id);
3607
3608
return ret;
3609
}
3610
3611
static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3612
{
3613
const struct ublk_batch_io *uc = &data->header;
3614
struct io_uring_cmd *cmd = data->cmd;
3615
struct ublk_batch_io_iter iter = {
3616
.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3617
.total = uc->nr_elem * uc->elem_bytes,
3618
.elem_bytes = uc->elem_bytes,
3619
};
3620
int ret;
3621
3622
mutex_lock(&data->ub->mutex);
3623
ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3624
3625
if (ret && iter.done)
3626
ublk_batch_revert_prep_cmd(&iter, data);
3627
mutex_unlock(&data->ub->mutex);
3628
return ret;
3629
}
3630
3631
static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3632
struct ublk_io *io,
3633
union ublk_io_buf *buf)
3634
{
3635
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3636
return -EBUSY;
3637
3638
/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3639
if (ublk_need_map_io(ubq) && !buf->addr)
3640
return -EINVAL;
3641
return 0;
3642
}
3643
3644
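/*
 * Descriptive note (added): commit one element of a UBLK_U_IO_COMMIT_IO_CMDS
 * batch: validate that the io is owned by the server, record the result and
 * buffer, unregister an auto-registered buffer if needed, and complete the
 * request (through the shared io completion batch) once the last reference
 * is dropped.
 */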
static int ublk_batch_commit_io(struct ublk_queue *ubq,
3645
const struct ublk_batch_io_data *data,
3646
const struct ublk_elem_header *elem)
3647
{
3648
struct ublk_io *io = &ubq->ios[elem->tag];
3649
const struct ublk_batch_io *uc = &data->header;
3650
u16 buf_idx = UBLK_INVALID_BUF_IDX;
3651
union ublk_io_buf buf = { 0 };
3652
struct request *req = NULL;
3653
bool auto_reg = false;
3654
bool compl = false;
3655
int ret;
3656
3657
if (ublk_dev_support_auto_buf_reg(data->ub)) {
3658
buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3659
auto_reg = true;
3660
} else if (ublk_dev_need_map_io(data->ub))
3661
buf.addr = ublk_batch_buf_addr(uc, elem);
3662
3663
ublk_io_lock(io);
3664
ret = ublk_batch_commit_io_check(ubq, io, &buf);
3665
if (!ret) {
3666
io->res = elem->result;
3667
io->buf = buf;
3668
req = ublk_fill_io_cmd(io, data->cmd);
3669
3670
if (auto_reg)
3671
ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3672
compl = ublk_need_complete_req(data->ub, io);
3673
}
3674
ublk_io_unlock(io);
3675
3676
if (unlikely(ret)) {
3677
pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3678
__func__, data->ub->dev_info.dev_id, ubq->q_id,
3679
elem->tag, ret);
3680
return ret;
3681
}
3682
3683
if (buf_idx != UBLK_INVALID_BUF_IDX)
3684
io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3685
if (req_op(req) == REQ_OP_ZONE_APPEND)
3686
req->__sector = ublk_batch_zone_lba(uc, elem);
3687
if (compl)
3688
__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3689
return 0;
3690
}
3691
3692
static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3693
{
3694
const struct ublk_batch_io *uc = &data->header;
3695
struct io_uring_cmd *cmd = data->cmd;
3696
struct ublk_batch_io_iter iter = {
3697
.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3698
.total = uc->nr_elem * uc->elem_bytes,
3699
.elem_bytes = uc->elem_bytes,
3700
};
3701
DEFINE_IO_COMP_BATCH(iob);
3702
int ret;
3703
3704
data->iob = &iob;
3705
ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3706
3707
if (iob.complete)
3708
iob.complete(&iob);
3709
3710
return iter.done == 0 ? ret : iter.done;
3711
}
3712
3713
static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3714
{
3715
unsigned elem_bytes = sizeof(struct ublk_elem_header);
3716
3717
if (uc->flags & ~UBLK_BATCH_F_ALL)
3718
return -EINVAL;
3719
3720
/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3721
if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3722
(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3723
return -EINVAL;
3724
3725
elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3726
(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3727
if (uc->elem_bytes != elem_bytes)
3728
return -EINVAL;
3729
return 0;
3730
}
3731
3732
static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3733
{
3734
const struct ublk_batch_io *uc = &data->header;
3735
3736
if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3737
return -EINVAL;
3738
3739
if (uc->nr_elem > data->ub->dev_info.queue_depth)
3740
return -E2BIG;
3741
3742
if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3743
!ublk_dev_is_zoned(data->ub))
3744
return -EINVAL;
3745
3746
if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3747
!ublk_dev_need_map_io(data->ub))
3748
return -EINVAL;
3749
3750
if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3751
!ublk_dev_support_auto_buf_reg(data->ub))
3752
return -EINVAL;
3753
3754
return ublk_check_batch_cmd_flags(uc);
3755
}
3756
3757
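/*
 * Descriptive note (added): attach a newly allocated fetch command to the
 * queue. If the queue is being aborted the command is freed and -ENODEV
 * returned; otherwise it is parked on fcmd_head and, if an idle fetch command
 * can be activated, pending tags are dispatched either inline (same
 * io_ring_ctx) or via task work. Returns -EIOCBQUEUED on success.
 */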
static int ublk_batch_attach(struct ublk_queue *ubq,
3758
struct ublk_batch_io_data *data,
3759
struct ublk_batch_fetch_cmd *fcmd)
3760
{
3761
struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3762
bool free = false;
3763
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3764
3765
spin_lock(&ubq->evts_lock);
3766
if (unlikely(ubq->force_abort || ubq->canceling)) {
3767
free = true;
3768
} else {
3769
list_add_tail(&fcmd->node, &ubq->fcmd_head);
3770
new_fcmd = __ublk_acquire_fcmd(ubq);
3771
}
3772
spin_unlock(&ubq->evts_lock);
3773
3774
if (unlikely(free)) {
3775
ublk_batch_free_fcmd(fcmd);
3776
return -ENODEV;
3777
}
3778
3779
pdu->ubq = ubq;
3780
pdu->fcmd = fcmd;
3781
io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3782
3783
if (!new_fcmd)
3784
goto out;
3785
3786
/*
3787
* If the two fetch commands originate from the same io_ring_ctx,
3788
* run batch dispatch directly. Otherwise, schedule task work for
3789
* doing it.
3790
*/
3791
if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3792
io_uring_cmd_ctx_handle(fcmd->cmd)) {
3793
data->cmd = new_fcmd->cmd;
3794
ublk_batch_dispatch(ubq, data, new_fcmd);
3795
} else {
3796
io_uring_cmd_complete_in_task(new_fcmd->cmd,
3797
ublk_batch_tw_cb);
3798
}
3799
out:
3800
return -EIOCBQUEUED;
3801
}
3802
3803
static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3804
{
3805
struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3806
struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3807
3808
if (!fcmd)
3809
return -ENOMEM;
3810
3811
return ublk_batch_attach(ubq, data, fcmd);
3812
}
3813
3814
static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3815
{
3816
const struct ublk_batch_io *uc = &data->header;
3817
3818
if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3819
return -EINVAL;
3820
3821
if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3822
return -EINVAL;
3823
3824
if (uc->elem_bytes != sizeof(__u16))
3825
return -EINVAL;
3826
3827
if (uc->flags != 0)
3828
return -EINVAL;
3829
3830
return 0;
3831
}
3832
3833
static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
				     unsigned int issue_flags)
{
	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe);
	struct ublk_device *ub = cmd->file->private_data;
	unsigned tag = READ_ONCE(ub_cmd->tag);
	unsigned q_id = READ_ONCE(ub_cmd->q_id);
	unsigned index = READ_ONCE(ub_cmd->addr);
	struct ublk_queue *ubq;
	struct ublk_io *io;

	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
		return -EOPNOTSUPP;

	ubq = ublk_get_queue(ub, q_id);
	io = &ubq->ios[tag];
	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
				    issue_flags);
}

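/*
 * uring_cmd entry point for the batch char device: copy the SQE payload
 * with READ_ONCE(), validate it per command, then dispatch to the
 * prep/commit/fetch handlers above.
 */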
static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
				      unsigned int issue_flags)
{
	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe);
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_batch_io_data data = {
		.ub = ub,
		.cmd = cmd,
		.header = (struct ublk_batch_io) {
			.q_id = READ_ONCE(uc->q_id),
			.flags = READ_ONCE(uc->flags),
			.nr_elem = READ_ONCE(uc->nr_elem),
			.elem_bytes = READ_ONCE(uc->elem_bytes),
		},
		.issue_flags = issue_flags,
	};
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_batch_cancel_fn(cmd, issue_flags);
		return 0;
	}

	switch (cmd_op) {
	case UBLK_U_IO_PREP_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_prep_cmd(&data);
		break;
	case UBLK_U_IO_COMMIT_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_commit_cmd(&data);
		break;
	case UBLK_U_IO_FETCH_IO_CMDS:
		ret = ublk_validate_batch_fetch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_fetch_cmd(&data);
		break;
	default:
		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
		break;
	}
out:
	return ret;
}

static inline bool ublk_check_ubuf_dir(const struct request *req,
				       int ubuf_dir)
{
	/* copy ubuf to request pages */
	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
	    ubuf_dir == ITER_SOURCE)
		return true;

	/* copy request pages to ubuf */
	if ((req_op(req) == REQ_OP_WRITE ||
	     req_op(req) == REQ_OP_ZONE_APPEND) &&
	    ubuf_dir == ITER_DEST)
		return true;

	return false;
}

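/*
 * Back end of pread()/pwrite() on the char device (user copy). The file
 * offset encodes the target: hw queue (ublk_pos_to_hwq()), tag
 * (ublk_pos_to_tag()), byte offset into the request payload
 * (ublk_pos_to_buf_off()), and UBLKSRV_IO_INTEGRITY_FLAG selects the
 * integrity buffer instead of the data buffer. Copies issued from the
 * daemon task skip request reference counting; any other task has to
 * grab a reference via __ublk_check_and_get_req().
 */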
static ssize_t
ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
{
	struct ublk_device *ub = iocb->ki_filp->private_data;
	struct ublk_queue *ubq;
	struct request *req;
	struct ublk_io *io;
	unsigned data_len;
	bool is_integrity;
	bool on_daemon;
	size_t buf_off;
	u16 tag, q_id;
	ssize_t ret;

	if (!user_backed_iter(iter))
		return -EACCES;

	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		return -EACCES;

	tag = ublk_pos_to_tag(iocb->ki_pos);
	q_id = ublk_pos_to_hwq(iocb->ki_pos);
	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);

	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
		return -EINVAL;

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	ubq = ublk_get_queue(ub, q_id);
	if (!ublk_dev_support_user_copy(ub))
		return -EACCES;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	io = &ubq->ios[tag];
	on_daemon = current == READ_ONCE(io->task);
	if (on_daemon) {
		/* On daemon, io can't be completed concurrently, so skip ref */
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			return -EINVAL;

		req = io->req;
		if (!ublk_rq_has_data(req))
			return -EINVAL;
	} else {
		req = __ublk_check_and_get_req(ub, q_id, tag, io);
		if (!req)
			return -EINVAL;
	}

	if (is_integrity) {
		struct blk_integrity *bi = &req->q->limits.integrity;

		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
	} else {
		data_len = blk_rq_bytes(req);
	}
	if (buf_off > data_len) {
		ret = -EINVAL;
		goto out;
	}

	if (!ublk_check_ubuf_dir(req, dir)) {
		ret = -EACCES;
		goto out;
	}

	if (is_integrity)
		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
	else
		ret = ublk_copy_user_pages(req, buf_off, iter, dir);

out:
	if (!on_daemon)
		ublk_put_req_ref(io, req);
	return ret;
}

static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return ublk_user_copy(iocb, to, ITER_DEST);
}

static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return ublk_user_copy(iocb, from, ITER_SOURCE);
}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static const struct file_operations ublk_ch_batch_io_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_batch_io_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int size, i;

	size = ublk_queue_cmd_buf_size(ub);

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];
		if (io->task)
			put_task_struct(io->task);
		WARN_ON_ONCE(refcount_read(&io->ref));
		WARN_ON_ONCE(io->task_registered_buffers);
	}

	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));

	if (ublk_dev_support_batch_io(ub))
		ublk_io_evts_deinit(ubq);

	kvfree(ubq);
}

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ub->queues[q_id];

	if (!ubq)
		return;

	__ublk_deinit_queue(ub, ubq);
	ub->queues[q_id] = NULL;
}

static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
{
	unsigned int cpu;

	/* Find first CPU mapped to this queue */
	for_each_possible_cpu(cpu) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
			return cpu_to_node(cpu);
	}

	return NUMA_NO_NODE;
}

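/*
 * Allocate one ublk queue together with its io command buffer on the
 * NUMA node derived from the queue's CPU mapping; batch-io devices
 * additionally get their io event state and fetch-command list set up.
 */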
static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	int depth = ub->dev_info.queue_depth;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	struct ublk_queue *ubq;
	struct page *page;
	int numa_node;
	int size, i, ret;

	/* Determine NUMA node based on queue's CPU affinity */
	numa_node = ublk_get_queue_numa_node(ub, q_id);

	/* Allocate queue structure on local NUMA node */
	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
			    numa_node);
	if (!ubq)
		return -ENOMEM;

	spin_lock_init(&ubq->cancel_lock);
	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = depth;
	size = ublk_queue_cmd_buf_size(ub);

	/* Allocate I/O command buffer on local NUMA node */
	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
	if (!page) {
		kvfree(ubq);
		return -ENOMEM;
	}
	ubq->io_cmd_buf = page_address(page);

	for (i = 0; i < ubq->q_depth; i++)
		spin_lock_init(&ubq->ios[i].lock);

	if (ublk_dev_support_batch_io(ub)) {
		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
		if (ret)
			goto fail;
		INIT_LIST_HEAD(&ubq->fcmd_head);
	}
	ub->queues[q_id] = ubq;
	ubq->dev = ub;

	return 0;
fail:
	__ublk_deinit_queue(ub, ubq);
	return ret;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_deinit_queue(ub, i);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int i, ret;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		ret = ublk_init_queue(ub, i);
		if (ret)
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id, if @id >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
				GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
}

static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = &ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	if (ublk_dev_support_batch_io(ub))
		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
	else
		cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;

	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
		unprivileged_ublks_added++;
	return 0;
fail:
	put_device(dev);
	return ret;
}

/* align max io buffer size with PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;

	ub->dev_info.max_io_buf_bytes =
		round_down(max_io_bytes, PAGE_SIZE);
}

static int ublk_add_tag_set(struct ublk_device *ub)
{
	if (ublk_dev_support_batch_io(ub))
		ub->tag_set.ops = &ublk_batch_mq_ops;
	else
		ub->tag_set.ops = &ublk_mq_ops;
	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
	ub->tag_set.numa_node = NUMA_NO_NODE;
	ub->tag_set.driver_data = ub;
	return blk_mq_alloc_tag_set(&ub->tag_set);
}

static void ublk_remove(struct ublk_device *ub)
{
	bool unprivileged;

	ublk_stop_dev(ub);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	ublk_put_device(ub);

	if (unprivileged)
		unprivileged_ublks_added--;
}

static struct ublk_device *ublk_get_device_from_id(int idx)
{
	struct ublk_device *ub = NULL;

	if (idx < 0)
		return NULL;

	spin_lock(&ublk_idr_lock);
	ub = idr_find(&ublk_index_idr, idx);
	if (ub)
		ub = ublk_get_device(ub);
	spin_unlock(&ublk_idr_lock);

	return ub;
}

static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
{
	rcu_read_lock();
	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
	rcu_read_unlock();

	return ub->ublksrv_tgid == ublksrv_pid;
}

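/*
 * UBLK_CMD_START_DEV: translate the previously set parameters into
 * queue_limits, wait on ub->completion, verify the caller's pid against
 * the registered server, then allocate and add the ublkb%d disk.
 * Partition scan is kept suppressed and deferred to async work (or
 * skipped entirely with UBLK_F_NO_AUTO_PART_SCAN, and left suppressed
 * for unprivileged daemons).
 */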
static int ublk_ctrl_start_dev(struct ublk_device *ub,
			       const struct ublksrv_ctrl_cmd *header)
{
	const struct ublk_param_basic *p = &ub->params.basic;
	int ublksrv_pid = (int)header->data[0];
	struct queue_limits lim = {
		.logical_block_size = 1 << p->logical_bs_shift,
		.physical_block_size = 1 << p->physical_bs_shift,
		.io_min = 1 << p->io_min_shift,
		.io_opt = 1 << p->io_opt_shift,
		.max_hw_sectors = p->max_sectors,
		.chunk_sectors = p->chunk_sectors,
		.virt_boundary_mask = p->virt_boundary_mask,
		.max_segments = USHRT_MAX,
		.max_segment_size = UINT_MAX,
		.dma_alignment = 3,
	};
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;
	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *pd = &ub->params.discard;

		lim.discard_alignment = pd->discard_alignment;
		lim.discard_granularity = pd->discard_granularity;
		lim.max_hw_discard_sectors = pd->max_discard_sectors;
		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
		lim.max_discard_segments = pd->max_discard_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
		const struct ublk_param_zoned *p = &ub->params.zoned;

		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return -EOPNOTSUPP;

		lim.features |= BLK_FEAT_ZONED;
		lim.max_active_zones = p->max_active_zones;
		lim.max_open_zones = p->max_open_zones;
		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
			lim.features |= BLK_FEAT_FUA;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
		lim.features |= BLK_FEAT_ROTATIONAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
		lim.dma_alignment = ub->params.dma.alignment;

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
		lim.max_segment_size = ub->params.seg.max_segment_size;
		lim.max_segments = ub->params.seg.max_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);

		lim.max_integrity_segments =
			p->max_integrity_segments ?: USHRT_MAX;
		lim.integrity = (struct blk_integrity) {
			.flags = ublk_integrity_flags(p->flags),
			.csum_type = ublk_integrity_csum_type(p->csum_type),
			.metadata_size = p->metadata_size,
			.pi_offset = p->pi_offset,
			.interval_exp = p->interval_exp,
			.tag_size = p->tag_size,
			.pi_tuple_size = pi_tuple_size,
		};
	}

	if (wait_for_completion_interruptible(&ub->completion) != 0)
		return -EINTR;

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	/* the device may no longer be ready in case of F_BATCH */
	if (!ublk_dev_ready(ub)) {
		ret = -EINVAL;
		goto out_unlock;
	}
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->ub_disk = disk;

	ublk_apply_params(ub);

	/*
	 * Suppress partition scan to avoid potential IO hang.
	 *
	 * If a ublk server error occurs during partition scan, the IO may
	 * wait while holding ub->mutex, which can deadlock with other
	 * operations that need the mutex. Defer partition scan to async
	 * work.
	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
	 * permanently.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);

	ublk_get_device(ub);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

	/* Skip partition scan if disabled by user */
	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
		clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
	} else {
		/* Schedule async partition scan for trusted daemons */
		if (!ub->unprivileged_daemons)
			schedule_work(&ub->partition_scan_work);
	}

out_put_cdev:
	if (ret) {
		ublk_detach_disk(ub);
		ublk_put_device(ub);
	}
	if (ret)
		put_disk(disk);
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
					const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	cpumask_var_t cpumask;
	unsigned long queue;
	unsigned int retlen;
	unsigned int i;
	int ret;

	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (header->len & (sizeof(unsigned long)-1))
		return -EINVAL;
	if (!header->addr)
		return -EINVAL;

	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		return -ENOMEM;

	for_each_possible_cpu(i) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
			cpumask_set_cpu(i, cpumask);
	}

	ret = -EFAULT;
	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, cpumask, retlen))
		goto out_free_cpumask;
	if (retlen != header->len &&
	    clear_user(argp + retlen, header->len - retlen))
		goto out_free_cpumask;

	ret = 0;
out_free_cpumask:
	free_cpumask_var(cpumask);
	return ret;
}

static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}

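/*
 * UBLK_CMD_ADD_DEV: validate the dev_info passed from userspace,
 * normalize the requested feature flags (the sanitized flags are copied
 * back as the negotiation result), allocate the device number, tag set
 * and queues, and finally create the ublkc%d char device for the ublk
 * server to attach to.
 */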
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * An unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE may still hang error handling, so recovery
	 * features can't be supported for unprivileged ublk yet.
	 *
	 * TODO: provide forward progress for the RECOVERY handler, so that
	 * unprivileged devices can benefit from it.
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depend on userspace to fill the request
		 * buffer by pwrite() to the ublk char device, which can't be
		 * used for an unprivileged device.
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access the integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by the current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * The 64bit flags will be copied back to userspace as the feature
	 * negotiation result, so the flags which the driver doesn't support
	 * yet have to be cleared, then userspace can get the correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy.
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that the ublksrv daemon can be set up.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}

static inline bool ublk_idr_freed(int id)
{
	void *ptr;

	spin_lock(&ublk_idr_lock);
	ptr = idr_find(&ublk_index_idr, id);
	spin_unlock(&ublk_idr_lock);

	return ptr == NULL;
}

static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
	struct ublk_device *ub = *p_ub;
	int idx = ub->ub_number;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
		ublk_remove(ub);
		set_bit(UB_STATE_DELETED, &ub->state);
	}

	/* Mark the reference as consumed */
	*p_ub = NULL;
	ublk_put_device(ub);
	mutex_unlock(&ublk_ctl_mutex);

	/*
	 * Wait until the idr is removed, then it can be reused after the
	 * DEL_DEV command returns.
	 *
	 * If we return because of a user interrupt, a future delete command
	 * may come:
	 *
	 * - the device number isn't freed, this device won't or needn't
	 *   be deleted again, since UB_STATE_DELETED is set, and the device
	 *   will be released after the last reference is dropped
	 *
	 * - the device number is freed already, we will not find this
	 *   device via ublk_get_device_from_id()
	 */
	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
		return -EINTR;
	return 0;
}

static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
				      const struct ublksrv_ctrl_cmd *header)
{
	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}

static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
}

static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
{
	struct gendisk *disk;
	int ret = 0;

	disk = ublk_get_disk(ub);
	if (!disk)
		return -ENODEV;

	mutex_lock(&disk->open_mutex);
	if (disk_openers(disk) > 0) {
		ret = -EBUSY;
		goto unlock;
	}
	ub->block_open = true;
	/* release open_mutex as del_gendisk() will reacquire it */
	mutex_unlock(&disk->open_mutex);

	ublk_ctrl_stop_dev(ub);
	goto out;

unlock:
	mutex_unlock(&disk->open_mutex);
out:
	ublk_put_disk(disk);
	return ret;
}

static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
				  const struct ublksrv_ctrl_cmd *header)
{
	struct task_struct *p;
	struct pid *pid;
	struct ublksrv_ctrl_dev_info dev_info;
	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
	void __user *argp = (void __user *)(unsigned long)header->addr;

	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
		return -EINVAL;

	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
	dev_info.ublksrv_pid = -1;

	if (init_ublksrv_tgid > 0) {
		rcu_read_lock();
		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
		p = pid_task(pid, PIDTYPE_TGID);
		if (p) {
			int vnr = task_tgid_vnr(p);

			if (vnr)
				dev_info.ublksrv_pid = vnr;
		}
		rcu_read_unlock();
	}

	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
		return -EFAULT;

	return 0;
}

/* TYPE_DEVT is readonly, so fill it up before returning to userspace */
static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
{
	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);

	if (ub->ub_disk) {
		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
	} else {
		ub->params.devt.disk_major = 0;
		ub->params.devt.disk_minor = 0;
	}
	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
}

static int ublk_ctrl_get_params(struct ublk_device *ub,
				const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	ublk_ctrl_fill_params_devt(ub);
	if (copy_to_user(argp, &ub->params, ph.len))
		ret = -EFAULT;
	else
		ret = 0;
	mutex_unlock(&ub->mutex);

	return ret;
}

static int ublk_ctrl_set_params(struct ublk_device *ub,
				const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret = -EFAULT;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len || !ph.types)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	if (test_bit(UB_STATE_USED, &ub->state)) {
		/*
		 * Parameters can only be changed when device hasn't
		 * been started yet
		 */
		ret = -EACCES;
	} else if (copy_from_user(&ub->params, argp, ph.len)) {
		ret = -EFAULT;
	} else {
		/* clear all we don't support yet */
		ub->params.types &= UBLK_PARAM_TYPE_ALL;
		ret = ublk_validate_params(ub);
		if (ret)
			ub->params.types = 0;
	}
	mutex_unlock(&ub->mutex);

	return ret;
}

static int ublk_ctrl_start_recovery(struct ublk_device *ub)
{
	int ret = -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;
	/*
	 * START_RECOVERY is only allowed after:
	 *
	 * (1) UB_STATE_OPEN is not set, which means the dying process has
	 *     exited and the related io_uring ctx is freed, so the file
	 *     struct of /dev/ublkcX is released.
	 *
	 * and one of the following holds
	 *
	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
	 *     (a) has quiesced the request queue
	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
	 *     (d) has completed/canceled all ioucmds owned by the dying process
	 *
	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
	 *     quiesced, but all I/O is being immediately errored
	 */
	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
	init_completion(&ub->completion);
	ret = 0;
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_end_recovery(struct ublk_device *ub,
				  const struct ublksrv_ctrl_cmd *header)
{
	int ublksrv_pid = (int)header->data[0];
	int ret = -EINVAL;

	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
			header->dev_id);

	if (wait_for_completion_interruptible(&ub->completion))
		return -EINTR;

	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
			header->dev_id);

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;

	if (!ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->dev_info.state = UBLK_S_DEV_LIVE;
	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
			__func__, ublksrv_pid, header->dev_id);
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
	ret = 0;
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	u64 features = UBLK_F_ALL;

	if (header->len != UBLK_FEATURES_LEN || !header->addr)
		return -EINVAL;

	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
		return -EFAULT;

	return 0;
}

static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
{
	struct ublk_param_basic *p = &ub->params.basic;
	u64 new_size = header->data[0];

	mutex_lock(&ub->mutex);
	p->dev_sectors = new_size;
	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
	mutex_unlock(&ub->mutex);
}

struct count_busy {
	const struct ublk_queue *ubq;
	unsigned int nr_busy;
};

static bool ublk_count_busy_req(struct request *rq, void *data)
{
	struct count_busy *idle = data;

	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
		idle->nr_busy += 1;
	return true;
}

/* uring_cmd is guaranteed to be active if the associated request is idle */
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
{
	struct count_busy data = {
		.ubq = ubq,
	};

	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
	return data.nr_busy < ubq->q_depth;
}

/* Wait until each hw queue has at least one idle IO */
static int ublk_wait_for_idle_io(struct ublk_device *ub,
				 unsigned int timeout_ms)
{
	unsigned int elapsed = 0;
	int ret;

	/*
	 * For UBLK_F_BATCH_IO the ublk server can get notified via an
	 * existing or new fetch command, so there is no need to wait here.
	 */
	if (ublk_dev_support_batch_io(ub))
		return 0;

	while (elapsed < timeout_ms && !signal_pending(current)) {
		unsigned int queues_cancelable = 0;
		int i;

		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
			struct ublk_queue *ubq = ublk_get_queue(ub, i);

			queues_cancelable += !!ubq_has_idle_io(ubq);
		}

		/*
		 * Each queue needs at least one active command for
		 * notifying ublk server
		 */
		if (queues_cancelable == ub->dev_info.nr_hw_queues)
			break;

		msleep(UBLK_REQUEUE_DELAY_MS);
		elapsed += UBLK_REQUEUE_DELAY_MS;
	}

	if (signal_pending(current))
		ret = -EINTR;
	else if (elapsed >= timeout_ms)
		ret = -EBUSY;
	else
		ret = 0;

	return ret;
}

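/*
 * UBLK_CMD_QUIESCE_DEV: mark a live device as canceling while its
 * request queue is briefly quiesced, wait (up to the caller-supplied
 * timeout) until every hw queue has an idle IO that can carry the
 * notification, then cancel the pending uring_cmds.
 */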
static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
				 const struct ublksrv_ctrl_cmd *header)
{
	/* zero means wait forever */
	u64 timeout_ms = header->data[0];
	struct gendisk *disk;
	int ret = -ENODEV;

	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
		return -EOPNOTSUPP;

	mutex_lock(&ub->mutex);
	disk = ublk_get_disk(ub);
	if (!disk)
		goto unlock;
	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		goto put_disk;

	ret = 0;
	/* already in expected state */
	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
		goto put_disk;

	/* Mark the device as canceling */
	mutex_lock(&ub->cancel_mutex);
	blk_mq_quiesce_queue(disk->queue);
	ublk_set_canceling(ub, true);
	blk_mq_unquiesce_queue(disk->queue);
	mutex_unlock(&ub->cancel_mutex);

	if (!timeout_ms)
		timeout_ms = UINT_MAX;
	ret = ublk_wait_for_idle_io(ub, timeout_ms);

put_disk:
	ublk_put_disk(disk);
unlock:
	mutex_unlock(&ub->mutex);

	/* Cancel pending uring_cmd */
	if (!ret)
		ublk_cancel_dev(ub);
	return ret;
}

/*
 * All control commands are sent via /dev/ublk-control, so we have to check
 * the destination device's permission
 */
static int ublk_char_dev_permission(struct ublk_device *ub,
				    const char *dev_path, int mask)
{
	int err;
	struct path path;
	struct kstat stat;

	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
	if (err)
		return err;

	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
	if (err)
		goto exit;

	err = -EPERM;
	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
		goto exit;

	err = inode_permission(&nop_mnt_idmap,
			       d_backing_inode(path.dentry), mask);
exit:
	path_put(&path);
	return err;
}

static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	char *dev_path = NULL;
	int ret = 0;
	int mask;

	if (!unprivileged) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		/*
		 * The newly added UBLK_CMD_GET_DEV_INFO2 command includes
		 * char_dev_path in its payload too, since userspace may not
		 * know whether the specified device was created in
		 * unprivileged mode.
		 */
		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
			return 0;
	}

	/*
	 * User has to provide the char device path for unprivileged ublk
	 *
	 * header->addr always points to the dev path buffer, and
	 * header->dev_path_len records the length of the dev path buffer.
	 */
	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
		return -EINVAL;

	if (header->len < header->dev_path_len)
		return -EINVAL;

	dev_path = memdup_user_nul(argp, header->dev_path_len);
	if (IS_ERR(dev_path))
		return PTR_ERR(dev_path);

	ret = -EINVAL;
	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_PARAMS:
	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
		mask = MAY_READ;
		break;
	case UBLK_CMD_START_DEV:
	case UBLK_CMD_STOP_DEV:
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_DEL_DEV:
	case UBLK_CMD_SET_PARAMS:
	case UBLK_CMD_START_USER_RECOVERY:
	case UBLK_CMD_END_USER_RECOVERY:
	case UBLK_CMD_UPDATE_SIZE:
	case UBLK_CMD_QUIESCE_DEV:
	case UBLK_CMD_TRY_STOP_DEV:
		mask = MAY_READ | MAY_WRITE;
		break;
	default:
		goto exit;
	}

	ret = ublk_char_dev_permission(ub, dev_path, mask);
	if (!ret) {
		header->len -= header->dev_path_len;
		header->addr += header->dev_path_len;
	}
	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
			__func__, ub->ub_number, cmd_op,
			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
			dev_path, ret);
exit:
	kfree(dev_path);
	return ret;
}

static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
{
	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
		return false;
	default:
		return true;
	}
}

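/*
 * uring_cmd entry point for /dev/ublk-control: snapshot the SQE header
 * with READ_ONCE(), look up and permission-check the target device
 * (GET_FEATURES is handled before the lookup and ADD_DEV skips it), then
 * dispatch to the per-command handlers above.
 */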
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
			       unsigned int issue_flags)
{
	/* May point to userspace-mapped memory */
	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
	struct ublksrv_ctrl_cmd header;
	struct ublk_device *ub = NULL;
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
	    issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	if (!(issue_flags & IO_URING_F_SQE128))
		return -EINVAL;

	header.dev_id = READ_ONCE(ub_src->dev_id);
	header.queue_id = READ_ONCE(ub_src->queue_id);
	header.len = READ_ONCE(ub_src->len);
	header.addr = READ_ONCE(ub_src->addr);
	header.data[0] = READ_ONCE(ub_src->data[0]);
	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
	ublk_ctrl_cmd_dump(cmd_op, &header);

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
		ret = ublk_ctrl_get_features(&header);
		goto out;
	}

	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
		ret = -ENODEV;
		ub = ublk_get_device_from_id(header.dev_id);
		if (!ub)
			goto out;

		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
		if (ret)
			goto put_dev;
	}

	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_START_DEV:
		ret = ublk_ctrl_start_dev(ub, &header);
		break;
	case UBLK_CMD_STOP_DEV:
		ublk_ctrl_stop_dev(ub);
		ret = 0;
		break;
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
		ret = ublk_ctrl_get_dev_info(ub, &header);
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(&header);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(&ub, true);
		break;
	case UBLK_CMD_DEL_DEV_ASYNC:
		ret = ublk_ctrl_del_dev(&ub, false);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(ub, &header);
		break;
	case UBLK_CMD_GET_PARAMS:
		ret = ublk_ctrl_get_params(ub, &header);
		break;
	case UBLK_CMD_SET_PARAMS:
		ret = ublk_ctrl_set_params(ub, &header);
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		ret = ublk_ctrl_start_recovery(ub);
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		ret = ublk_ctrl_end_recovery(ub, &header);
		break;
	case UBLK_CMD_UPDATE_SIZE:
		ublk_ctrl_set_size(ub, &header);
		ret = 0;
		break;
	case UBLK_CMD_QUIESCE_DEV:
		ret = ublk_ctrl_quiesce_dev(ub, &header);
		break;
	case UBLK_CMD_TRY_STOP_DEV:
		ret = ublk_ctrl_try_stop_dev(ub);
		break;
	default:
		ret = -EOPNOTSUPP;
		break;
	}

put_dev:
	if (ub)
		ublk_put_device(ub);
out:
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd_op, header.dev_id, header.queue_id);
	return ret;
}

static const struct file_operations ublk_ctl_fops = {
	.open = nonseekable_open,
	.uring_cmd = ublk_ctrl_uring_cmd,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice ublk_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "ublk-control",
	.fops = &ublk_ctl_fops,
};

static int __init ublk_init(void)
{
	int ret;

	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
	/*
	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
	 */
	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
			UBLKSRV_IO_INTEGRITY_FLAG);
	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ret = class_register(&ublk_chr_class);
	if (ret)
		goto free_chrdev_region;

	return 0;

free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}

static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	class_unregister(&ublk_chr_class);
	misc_deregister(&ublk_misc);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}

module_init(ublk_init);
module_exit(ublk_exit);

static int ublk_set_max_unprivileged_ublks(const char *buf,
					   const struct kernel_param *kp)
{
	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
}

static int ublk_get_max_unprivileged_ublks(char *buf,
					   const struct kernel_param *kp)
{
	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
}

static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
	.set = ublk_set_max_unprivileged_ublks,
	.get = ublk_get_max_unprivileged_ublks,
};

module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
		&unprivileged_ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");

MODULE_AUTHOR("Ming Lei <[email protected]>");
MODULE_DESCRIPTION("Userspace block device");
MODULE_LICENSE("GPL");