Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/block/blk-mq.c
26242 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Block multiqueue core code
4
*
5
* Copyright (C) 2013-2014 Jens Axboe
6
* Copyright (C) 2013-2014 Christoph Hellwig
7
*/
8
#include <linux/kernel.h>
9
#include <linux/module.h>
10
#include <linux/backing-dev.h>
11
#include <linux/bio.h>
12
#include <linux/blkdev.h>
13
#include <linux/blk-integrity.h>
14
#include <linux/kmemleak.h>
15
#include <linux/mm.h>
16
#include <linux/init.h>
17
#include <linux/slab.h>
18
#include <linux/workqueue.h>
19
#include <linux/smp.h>
20
#include <linux/interrupt.h>
21
#include <linux/llist.h>
22
#include <linux/cpu.h>
23
#include <linux/cache.h>
24
#include <linux/sched/topology.h>
25
#include <linux/sched/signal.h>
26
#include <linux/delay.h>
27
#include <linux/crash_dump.h>
28
#include <linux/prefetch.h>
29
#include <linux/blk-crypto.h>
30
#include <linux/part_stat.h>
31
#include <linux/sched/isolation.h>
32
33
#include <trace/events/block.h>
34
35
#include <linux/t10-pi.h>
36
#include "blk.h"
37
#include "blk-mq.h"
38
#include "blk-mq-debugfs.h"
39
#include "blk-pm.h"
40
#include "blk-stat.h"
41
#include "blk-mq-sched.h"
42
#include "blk-rq-qos.h"
43
44
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
45
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
46
static DEFINE_MUTEX(blk_mq_cpuhp_lock);
47
48
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
49
static void blk_mq_request_bypass_insert(struct request *rq,
50
blk_insert_t flags);
51
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
52
struct list_head *list);
53
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
54
struct io_comp_batch *iob, unsigned int flags);
55
56
/*
57
* Check if any of the ctx, dispatch list or elevator
58
* have pending work in this hardware queue.
59
*/
60
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
61
{
62
return !list_empty_careful(&hctx->dispatch) ||
63
sbitmap_any_bit_set(&hctx->ctx_map) ||
64
blk_mq_sched_has_work(hctx);
65
}
66
67
/*
68
* Mark this ctx as having pending work in this hardware queue
69
*/
70
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
71
struct blk_mq_ctx *ctx)
72
{
73
const int bit = ctx->index_hw[hctx->type];
74
75
if (!sbitmap_test_bit(&hctx->ctx_map, bit))
76
sbitmap_set_bit(&hctx->ctx_map, bit);
77
}
78
79
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
80
struct blk_mq_ctx *ctx)
81
{
82
const int bit = ctx->index_hw[hctx->type];
83
84
sbitmap_clear_bit(&hctx->ctx_map, bit);
85
}
86
87
struct mq_inflight {
88
struct block_device *part;
89
unsigned int inflight[2];
90
};
91
92
static bool blk_mq_check_in_driver(struct request *rq, void *priv)
93
{
94
struct mq_inflight *mi = priv;
95
96
if (rq->rq_flags & RQF_IO_STAT &&
97
(!bdev_is_partition(mi->part) || rq->part == mi->part) &&
98
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
99
mi->inflight[rq_data_dir(rq)]++;
100
101
return true;
102
}
103
104
void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
105
{
106
struct mq_inflight mi = { .part = part };
107
108
blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
109
&mi);
110
inflight[READ] = mi.inflight[READ];
111
inflight[WRITE] = mi.inflight[WRITE];
112
}
113
114
#ifdef CONFIG_LOCKDEP
115
static bool blk_freeze_set_owner(struct request_queue *q,
116
struct task_struct *owner)
117
{
118
if (!owner)
119
return false;
120
121
if (!q->mq_freeze_depth) {
122
q->mq_freeze_owner = owner;
123
q->mq_freeze_owner_depth = 1;
124
q->mq_freeze_disk_dead = !q->disk ||
125
test_bit(GD_DEAD, &q->disk->state) ||
126
!blk_queue_registered(q);
127
q->mq_freeze_queue_dying = blk_queue_dying(q);
128
return true;
129
}
130
131
if (owner == q->mq_freeze_owner)
132
q->mq_freeze_owner_depth += 1;
133
return false;
134
}
135
136
/* verify the last unfreeze in owner context */
137
static bool blk_unfreeze_check_owner(struct request_queue *q)
138
{
139
if (q->mq_freeze_owner != current)
140
return false;
141
if (--q->mq_freeze_owner_depth == 0) {
142
q->mq_freeze_owner = NULL;
143
return true;
144
}
145
return false;
146
}
147
148
#else
149
150
static bool blk_freeze_set_owner(struct request_queue *q,
151
struct task_struct *owner)
152
{
153
return false;
154
}
155
156
static bool blk_unfreeze_check_owner(struct request_queue *q)
157
{
158
return false;
159
}
160
#endif
161
162
bool __blk_freeze_queue_start(struct request_queue *q,
163
struct task_struct *owner)
164
{
165
bool freeze;
166
167
mutex_lock(&q->mq_freeze_lock);
168
freeze = blk_freeze_set_owner(q, owner);
169
if (++q->mq_freeze_depth == 1) {
170
percpu_ref_kill(&q->q_usage_counter);
171
mutex_unlock(&q->mq_freeze_lock);
172
if (queue_is_mq(q))
173
blk_mq_run_hw_queues(q, false);
174
} else {
175
mutex_unlock(&q->mq_freeze_lock);
176
}
177
178
return freeze;
179
}
180
181
void blk_freeze_queue_start(struct request_queue *q)
182
{
183
if (__blk_freeze_queue_start(q, current))
184
blk_freeze_acquire_lock(q);
185
}
186
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
187
188
void blk_mq_freeze_queue_wait(struct request_queue *q)
189
{
190
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
191
}
192
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
193
194
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
195
unsigned long timeout)
196
{
197
return wait_event_timeout(q->mq_freeze_wq,
198
percpu_ref_is_zero(&q->q_usage_counter),
199
timeout);
200
}
201
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
202
203
void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
204
{
205
blk_freeze_queue_start(q);
206
blk_mq_freeze_queue_wait(q);
207
}
208
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);
209
210
bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
211
{
212
bool unfreeze;
213
214
mutex_lock(&q->mq_freeze_lock);
215
if (force_atomic)
216
q->q_usage_counter.data->force_atomic = true;
217
q->mq_freeze_depth--;
218
WARN_ON_ONCE(q->mq_freeze_depth < 0);
219
if (!q->mq_freeze_depth) {
220
percpu_ref_resurrect(&q->q_usage_counter);
221
wake_up_all(&q->mq_freeze_wq);
222
}
223
unfreeze = blk_unfreeze_check_owner(q);
224
mutex_unlock(&q->mq_freeze_lock);
225
226
return unfreeze;
227
}
228
229
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
230
{
231
if (__blk_mq_unfreeze_queue(q, false))
232
blk_unfreeze_release_lock(q);
233
}
234
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);
235
236
/*
237
* non_owner variant of blk_freeze_queue_start
238
*
239
* Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
240
* by the same task. This is fragile and should not be used if at all
241
* possible.
242
*/
243
void blk_freeze_queue_start_non_owner(struct request_queue *q)
244
{
245
__blk_freeze_queue_start(q, NULL);
246
}
247
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
248
249
/* non_owner variant of blk_mq_unfreeze_queue */
250
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
251
{
252
__blk_mq_unfreeze_queue(q, false);
253
}
254
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
255
256
/*
257
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
258
* mpt3sas driver such that this function can be removed.
259
*/
260
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
261
{
262
unsigned long flags;
263
264
spin_lock_irqsave(&q->queue_lock, flags);
265
if (!q->quiesce_depth++)
266
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
267
spin_unlock_irqrestore(&q->queue_lock, flags);
268
}
269
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
270
271
/**
272
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
273
* @set: tag_set to wait on
274
*
275
* Note: it is driver's responsibility for making sure that quiesce has
276
* been started on or more of the request_queues of the tag_set. This
277
* function only waits for the quiesce on those request_queues that had
278
* the quiesce flag set using blk_mq_quiesce_queue_nowait.
279
*/
280
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
281
{
282
if (set->flags & BLK_MQ_F_BLOCKING)
283
synchronize_srcu(set->srcu);
284
else
285
synchronize_rcu();
286
}
287
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
288
289
/**
290
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
291
* @q: request queue.
292
*
293
* Note: this function does not prevent that the struct request end_io()
294
* callback function is invoked. Once this function is returned, we make
295
* sure no dispatch can happen until the queue is unquiesced via
296
* blk_mq_unquiesce_queue().
297
*/
298
void blk_mq_quiesce_queue(struct request_queue *q)
299
{
300
blk_mq_quiesce_queue_nowait(q);
301
/* nothing to wait for non-mq queues */
302
if (queue_is_mq(q))
303
blk_mq_wait_quiesce_done(q->tag_set);
304
}
305
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
306
307
/*
308
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
309
* @q: request queue.
310
*
311
* This function recovers queue into the state before quiescing
312
* which is done by blk_mq_quiesce_queue.
313
*/
314
void blk_mq_unquiesce_queue(struct request_queue *q)
315
{
316
unsigned long flags;
317
bool run_queue = false;
318
319
spin_lock_irqsave(&q->queue_lock, flags);
320
if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
321
;
322
} else if (!--q->quiesce_depth) {
323
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
324
run_queue = true;
325
}
326
spin_unlock_irqrestore(&q->queue_lock, flags);
327
328
/* dispatch requests which are inserted during quiescing */
329
if (run_queue)
330
blk_mq_run_hw_queues(q, true);
331
}
332
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
333
334
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
335
{
336
struct request_queue *q;
337
338
mutex_lock(&set->tag_list_lock);
339
list_for_each_entry(q, &set->tag_list, tag_set_list) {
340
if (!blk_queue_skip_tagset_quiesce(q))
341
blk_mq_quiesce_queue_nowait(q);
342
}
343
mutex_unlock(&set->tag_list_lock);
344
345
blk_mq_wait_quiesce_done(set);
346
}
347
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
348
349
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
350
{
351
struct request_queue *q;
352
353
mutex_lock(&set->tag_list_lock);
354
list_for_each_entry(q, &set->tag_list, tag_set_list) {
355
if (!blk_queue_skip_tagset_quiesce(q))
356
blk_mq_unquiesce_queue(q);
357
}
358
mutex_unlock(&set->tag_list_lock);
359
}
360
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
361
362
void blk_mq_wake_waiters(struct request_queue *q)
363
{
364
struct blk_mq_hw_ctx *hctx;
365
unsigned long i;
366
367
queue_for_each_hw_ctx(q, hctx, i)
368
if (blk_mq_hw_queue_mapped(hctx))
369
blk_mq_tag_wakeup_all(hctx->tags, true);
370
}
371
372
void blk_rq_init(struct request_queue *q, struct request *rq)
373
{
374
memset(rq, 0, sizeof(*rq));
375
376
INIT_LIST_HEAD(&rq->queuelist);
377
rq->q = q;
378
rq->__sector = (sector_t) -1;
379
INIT_HLIST_NODE(&rq->hash);
380
RB_CLEAR_NODE(&rq->rb_node);
381
rq->tag = BLK_MQ_NO_TAG;
382
rq->internal_tag = BLK_MQ_NO_TAG;
383
rq->start_time_ns = blk_time_get_ns();
384
blk_crypto_rq_set_defaults(rq);
385
}
386
EXPORT_SYMBOL(blk_rq_init);
387
388
/* Set start and alloc time when the allocated request is actually used */
389
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
390
{
391
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
392
if (blk_queue_rq_alloc_time(rq->q))
393
rq->alloc_time_ns = alloc_time_ns;
394
else
395
rq->alloc_time_ns = 0;
396
#endif
397
}
398
399
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
400
struct blk_mq_tags *tags, unsigned int tag)
401
{
402
struct blk_mq_ctx *ctx = data->ctx;
403
struct blk_mq_hw_ctx *hctx = data->hctx;
404
struct request_queue *q = data->q;
405
struct request *rq = tags->static_rqs[tag];
406
407
rq->q = q;
408
rq->mq_ctx = ctx;
409
rq->mq_hctx = hctx;
410
rq->cmd_flags = data->cmd_flags;
411
412
if (data->flags & BLK_MQ_REQ_PM)
413
data->rq_flags |= RQF_PM;
414
rq->rq_flags = data->rq_flags;
415
416
if (data->rq_flags & RQF_SCHED_TAGS) {
417
rq->tag = BLK_MQ_NO_TAG;
418
rq->internal_tag = tag;
419
} else {
420
rq->tag = tag;
421
rq->internal_tag = BLK_MQ_NO_TAG;
422
}
423
rq->timeout = 0;
424
425
rq->part = NULL;
426
rq->io_start_time_ns = 0;
427
rq->stats_sectors = 0;
428
rq->nr_phys_segments = 0;
429
rq->nr_integrity_segments = 0;
430
rq->end_io = NULL;
431
rq->end_io_data = NULL;
432
433
blk_crypto_rq_set_defaults(rq);
434
INIT_LIST_HEAD(&rq->queuelist);
435
/* tag was already set */
436
WRITE_ONCE(rq->deadline, 0);
437
req_ref_set(rq, 1);
438
439
if (rq->rq_flags & RQF_USE_SCHED) {
440
struct elevator_queue *e = data->q->elevator;
441
442
INIT_HLIST_NODE(&rq->hash);
443
RB_CLEAR_NODE(&rq->rb_node);
444
445
if (e->type->ops.prepare_request)
446
e->type->ops.prepare_request(rq);
447
}
448
449
return rq;
450
}
451
452
static inline struct request *
453
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
454
{
455
unsigned int tag, tag_offset;
456
struct blk_mq_tags *tags;
457
struct request *rq;
458
unsigned long tag_mask;
459
int i, nr = 0;
460
461
tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
462
if (unlikely(!tag_mask))
463
return NULL;
464
465
tags = blk_mq_tags_from_data(data);
466
for (i = 0; tag_mask; i++) {
467
if (!(tag_mask & (1UL << i)))
468
continue;
469
tag = tag_offset + i;
470
prefetch(tags->static_rqs[tag]);
471
tag_mask &= ~(1UL << i);
472
rq = blk_mq_rq_ctx_init(data, tags, tag);
473
rq_list_add_head(data->cached_rqs, rq);
474
nr++;
475
}
476
if (!(data->rq_flags & RQF_SCHED_TAGS))
477
blk_mq_add_active_requests(data->hctx, nr);
478
/* caller already holds a reference, add for remainder */
479
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
480
data->nr_tags -= nr;
481
482
return rq_list_pop(data->cached_rqs);
483
}
484
485
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
486
{
487
struct request_queue *q = data->q;
488
u64 alloc_time_ns = 0;
489
struct request *rq;
490
unsigned int tag;
491
492
/* alloc_time includes depth and tag waits */
493
if (blk_queue_rq_alloc_time(q))
494
alloc_time_ns = blk_time_get_ns();
495
496
if (data->cmd_flags & REQ_NOWAIT)
497
data->flags |= BLK_MQ_REQ_NOWAIT;
498
499
retry:
500
data->ctx = blk_mq_get_ctx(q);
501
data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
502
503
if (q->elevator) {
504
/*
505
* All requests use scheduler tags when an I/O scheduler is
506
* enabled for the queue.
507
*/
508
data->rq_flags |= RQF_SCHED_TAGS;
509
510
/*
511
* Flush/passthrough requests are special and go directly to the
512
* dispatch list.
513
*/
514
if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
515
!blk_op_is_passthrough(data->cmd_flags)) {
516
struct elevator_mq_ops *ops = &q->elevator->type->ops;
517
518
WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
519
520
data->rq_flags |= RQF_USE_SCHED;
521
if (ops->limit_depth)
522
ops->limit_depth(data->cmd_flags, data);
523
}
524
} else {
525
blk_mq_tag_busy(data->hctx);
526
}
527
528
if (data->flags & BLK_MQ_REQ_RESERVED)
529
data->rq_flags |= RQF_RESV;
530
531
/*
532
* Try batched alloc if we want more than 1 tag.
533
*/
534
if (data->nr_tags > 1) {
535
rq = __blk_mq_alloc_requests_batch(data);
536
if (rq) {
537
blk_mq_rq_time_init(rq, alloc_time_ns);
538
return rq;
539
}
540
data->nr_tags = 1;
541
}
542
543
/*
544
* Waiting allocations only fail because of an inactive hctx. In that
545
* case just retry the hctx assignment and tag allocation as CPU hotplug
546
* should have migrated us to an online CPU by now.
547
*/
548
tag = blk_mq_get_tag(data);
549
if (tag == BLK_MQ_NO_TAG) {
550
if (data->flags & BLK_MQ_REQ_NOWAIT)
551
return NULL;
552
/*
553
* Give up the CPU and sleep for a random short time to
554
* ensure that thread using a realtime scheduling class
555
* are migrated off the CPU, and thus off the hctx that
556
* is going away.
557
*/
558
msleep(3);
559
goto retry;
560
}
561
562
if (!(data->rq_flags & RQF_SCHED_TAGS))
563
blk_mq_inc_active_requests(data->hctx);
564
rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
565
blk_mq_rq_time_init(rq, alloc_time_ns);
566
return rq;
567
}
568
569
static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
570
struct blk_plug *plug,
571
blk_opf_t opf,
572
blk_mq_req_flags_t flags)
573
{
574
struct blk_mq_alloc_data data = {
575
.q = q,
576
.flags = flags,
577
.shallow_depth = 0,
578
.cmd_flags = opf,
579
.rq_flags = 0,
580
.nr_tags = plug->nr_ios,
581
.cached_rqs = &plug->cached_rqs,
582
.ctx = NULL,
583
.hctx = NULL
584
};
585
struct request *rq;
586
587
if (blk_queue_enter(q, flags))
588
return NULL;
589
590
plug->nr_ios = 1;
591
592
rq = __blk_mq_alloc_requests(&data);
593
if (unlikely(!rq))
594
blk_queue_exit(q);
595
return rq;
596
}
597
598
static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
599
blk_opf_t opf,
600
blk_mq_req_flags_t flags)
601
{
602
struct blk_plug *plug = current->plug;
603
struct request *rq;
604
605
if (!plug)
606
return NULL;
607
608
if (rq_list_empty(&plug->cached_rqs)) {
609
if (plug->nr_ios == 1)
610
return NULL;
611
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
612
if (!rq)
613
return NULL;
614
} else {
615
rq = rq_list_peek(&plug->cached_rqs);
616
if (!rq || rq->q != q)
617
return NULL;
618
619
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
620
return NULL;
621
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
622
return NULL;
623
624
rq_list_pop(&plug->cached_rqs);
625
blk_mq_rq_time_init(rq, blk_time_get_ns());
626
}
627
628
rq->cmd_flags = opf;
629
INIT_LIST_HEAD(&rq->queuelist);
630
return rq;
631
}
632
633
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
634
blk_mq_req_flags_t flags)
635
{
636
struct request *rq;
637
638
rq = blk_mq_alloc_cached_request(q, opf, flags);
639
if (!rq) {
640
struct blk_mq_alloc_data data = {
641
.q = q,
642
.flags = flags,
643
.shallow_depth = 0,
644
.cmd_flags = opf,
645
.rq_flags = 0,
646
.nr_tags = 1,
647
.cached_rqs = NULL,
648
.ctx = NULL,
649
.hctx = NULL
650
};
651
int ret;
652
653
ret = blk_queue_enter(q, flags);
654
if (ret)
655
return ERR_PTR(ret);
656
657
rq = __blk_mq_alloc_requests(&data);
658
if (!rq)
659
goto out_queue_exit;
660
}
661
rq->__data_len = 0;
662
rq->__sector = (sector_t) -1;
663
rq->bio = rq->biotail = NULL;
664
return rq;
665
out_queue_exit:
666
blk_queue_exit(q);
667
return ERR_PTR(-EWOULDBLOCK);
668
}
669
EXPORT_SYMBOL(blk_mq_alloc_request);
670
671
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
672
blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
673
{
674
struct blk_mq_alloc_data data = {
675
.q = q,
676
.flags = flags,
677
.shallow_depth = 0,
678
.cmd_flags = opf,
679
.rq_flags = 0,
680
.nr_tags = 1,
681
.cached_rqs = NULL,
682
.ctx = NULL,
683
.hctx = NULL
684
};
685
u64 alloc_time_ns = 0;
686
struct request *rq;
687
unsigned int cpu;
688
unsigned int tag;
689
int ret;
690
691
/* alloc_time includes depth and tag waits */
692
if (blk_queue_rq_alloc_time(q))
693
alloc_time_ns = blk_time_get_ns();
694
695
/*
696
* If the tag allocator sleeps we could get an allocation for a
697
* different hardware context. No need to complicate the low level
698
* allocator for this for the rare use case of a command tied to
699
* a specific queue.
700
*/
701
if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
702
WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
703
return ERR_PTR(-EINVAL);
704
705
if (hctx_idx >= q->nr_hw_queues)
706
return ERR_PTR(-EIO);
707
708
ret = blk_queue_enter(q, flags);
709
if (ret)
710
return ERR_PTR(ret);
711
712
/*
713
* Check if the hardware context is actually mapped to anything.
714
* If not tell the caller that it should skip this queue.
715
*/
716
ret = -EXDEV;
717
data.hctx = xa_load(&q->hctx_table, hctx_idx);
718
if (!blk_mq_hw_queue_mapped(data.hctx))
719
goto out_queue_exit;
720
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
721
if (cpu >= nr_cpu_ids)
722
goto out_queue_exit;
723
data.ctx = __blk_mq_get_ctx(q, cpu);
724
725
if (q->elevator)
726
data.rq_flags |= RQF_SCHED_TAGS;
727
else
728
blk_mq_tag_busy(data.hctx);
729
730
if (flags & BLK_MQ_REQ_RESERVED)
731
data.rq_flags |= RQF_RESV;
732
733
ret = -EWOULDBLOCK;
734
tag = blk_mq_get_tag(&data);
735
if (tag == BLK_MQ_NO_TAG)
736
goto out_queue_exit;
737
if (!(data.rq_flags & RQF_SCHED_TAGS))
738
blk_mq_inc_active_requests(data.hctx);
739
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
740
blk_mq_rq_time_init(rq, alloc_time_ns);
741
rq->__data_len = 0;
742
rq->__sector = (sector_t) -1;
743
rq->bio = rq->biotail = NULL;
744
return rq;
745
746
out_queue_exit:
747
blk_queue_exit(q);
748
return ERR_PTR(ret);
749
}
750
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
751
752
static void blk_mq_finish_request(struct request *rq)
753
{
754
struct request_queue *q = rq->q;
755
756
blk_zone_finish_request(rq);
757
758
if (rq->rq_flags & RQF_USE_SCHED) {
759
q->elevator->type->ops.finish_request(rq);
760
/*
761
* For postflush request that may need to be
762
* completed twice, we should clear this flag
763
* to avoid double finish_request() on the rq.
764
*/
765
rq->rq_flags &= ~RQF_USE_SCHED;
766
}
767
}
768
769
static void __blk_mq_free_request(struct request *rq)
770
{
771
struct request_queue *q = rq->q;
772
struct blk_mq_ctx *ctx = rq->mq_ctx;
773
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
774
const int sched_tag = rq->internal_tag;
775
776
blk_crypto_free_request(rq);
777
blk_pm_mark_last_busy(rq);
778
rq->mq_hctx = NULL;
779
780
if (rq->tag != BLK_MQ_NO_TAG) {
781
blk_mq_dec_active_requests(hctx);
782
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
783
}
784
if (sched_tag != BLK_MQ_NO_TAG)
785
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
786
blk_mq_sched_restart(hctx);
787
blk_queue_exit(q);
788
}
789
790
void blk_mq_free_request(struct request *rq)
791
{
792
struct request_queue *q = rq->q;
793
794
blk_mq_finish_request(rq);
795
796
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
797
laptop_io_completion(q->disk->bdi);
798
799
rq_qos_done(q, rq);
800
801
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
802
if (req_ref_put_and_test(rq))
803
__blk_mq_free_request(rq);
804
}
805
EXPORT_SYMBOL_GPL(blk_mq_free_request);
806
807
void blk_mq_free_plug_rqs(struct blk_plug *plug)
808
{
809
struct request *rq;
810
811
while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
812
blk_mq_free_request(rq);
813
}
814
815
void blk_dump_rq_flags(struct request *rq, char *msg)
816
{
817
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
818
rq->q->disk ? rq->q->disk->disk_name : "?",
819
(__force unsigned long long) rq->cmd_flags);
820
821
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
822
(unsigned long long)blk_rq_pos(rq),
823
blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
824
printk(KERN_INFO " bio %p, biotail %p, len %u\n",
825
rq->bio, rq->biotail, blk_rq_bytes(rq));
826
}
827
EXPORT_SYMBOL(blk_dump_rq_flags);
828
829
static void blk_account_io_completion(struct request *req, unsigned int bytes)
830
{
831
if (req->rq_flags & RQF_IO_STAT) {
832
const int sgrp = op_stat_group(req_op(req));
833
834
part_stat_lock();
835
part_stat_add(req->part, sectors[sgrp], bytes >> 9);
836
part_stat_unlock();
837
}
838
}
839
840
static void blk_print_req_error(struct request *req, blk_status_t status)
841
{
842
printk_ratelimited(KERN_ERR
843
"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
844
"phys_seg %u prio class %u\n",
845
blk_status_to_str(status),
846
req->q->disk ? req->q->disk->disk_name : "?",
847
blk_rq_pos(req), (__force u32)req_op(req),
848
blk_op_str(req_op(req)),
849
(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
850
req->nr_phys_segments,
851
IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
852
}
853
854
/*
855
* Fully end IO on a request. Does not support partial completions, or
856
* errors.
857
*/
858
static void blk_complete_request(struct request *req)
859
{
860
const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
861
int total_bytes = blk_rq_bytes(req);
862
struct bio *bio = req->bio;
863
864
trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
865
866
if (!bio)
867
return;
868
869
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
870
blk_integrity_complete(req, total_bytes);
871
872
/*
873
* Upper layers may call blk_crypto_evict_key() anytime after the last
874
* bio_endio(). Therefore, the keyslot must be released before that.
875
*/
876
blk_crypto_rq_put_keyslot(req);
877
878
blk_account_io_completion(req, total_bytes);
879
880
do {
881
struct bio *next = bio->bi_next;
882
883
/* Completion has already been traced */
884
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
885
886
if (blk_req_bio_is_zone_append(req, bio))
887
blk_zone_append_update_request_bio(req, bio);
888
889
if (!is_flush)
890
bio_endio(bio);
891
bio = next;
892
} while (bio);
893
894
/*
895
* Reset counters so that the request stacking driver
896
* can find how many bytes remain in the request
897
* later.
898
*/
899
if (!req->end_io) {
900
req->bio = NULL;
901
req->__data_len = 0;
902
}
903
}
904
905
/**
906
* blk_update_request - Complete multiple bytes without completing the request
907
* @req: the request being processed
908
* @error: block status code
909
* @nr_bytes: number of bytes to complete for @req
910
*
911
* Description:
912
* Ends I/O on a number of bytes attached to @req, but doesn't complete
913
* the request structure even if @req doesn't have leftover.
914
* If @req has leftover, sets it up for the next range of segments.
915
*
916
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
917
* %false return from this function.
918
*
919
* Note:
920
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
921
* except in the consistency check at the end of this function.
922
*
923
* Return:
924
* %false - this request doesn't have any more data
925
* %true - this request has more data
926
**/
927
bool blk_update_request(struct request *req, blk_status_t error,
928
unsigned int nr_bytes)
929
{
930
bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
931
bool quiet = req->rq_flags & RQF_QUIET;
932
int total_bytes;
933
934
trace_block_rq_complete(req, error, nr_bytes);
935
936
if (!req->bio)
937
return false;
938
939
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
940
error == BLK_STS_OK)
941
blk_integrity_complete(req, nr_bytes);
942
943
/*
944
* Upper layers may call blk_crypto_evict_key() anytime after the last
945
* bio_endio(). Therefore, the keyslot must be released before that.
946
*/
947
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
948
__blk_crypto_rq_put_keyslot(req);
949
950
if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
951
!test_bit(GD_DEAD, &req->q->disk->state)) {
952
blk_print_req_error(req, error);
953
trace_block_rq_error(req, error, nr_bytes);
954
}
955
956
blk_account_io_completion(req, nr_bytes);
957
958
total_bytes = 0;
959
while (req->bio) {
960
struct bio *bio = req->bio;
961
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
962
963
if (unlikely(error))
964
bio->bi_status = error;
965
966
if (bio_bytes == bio->bi_iter.bi_size) {
967
req->bio = bio->bi_next;
968
} else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
969
/*
970
* Partial zone append completions cannot be supported
971
* as the BIO fragments may end up not being written
972
* sequentially.
973
*/
974
bio->bi_status = BLK_STS_IOERR;
975
}
976
977
/* Completion has already been traced */
978
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
979
if (unlikely(quiet))
980
bio_set_flag(bio, BIO_QUIET);
981
982
bio_advance(bio, bio_bytes);
983
984
/* Don't actually finish bio if it's part of flush sequence */
985
if (!bio->bi_iter.bi_size) {
986
if (blk_req_bio_is_zone_append(req, bio))
987
blk_zone_append_update_request_bio(req, bio);
988
if (!is_flush)
989
bio_endio(bio);
990
}
991
992
total_bytes += bio_bytes;
993
nr_bytes -= bio_bytes;
994
995
if (!nr_bytes)
996
break;
997
}
998
999
/*
1000
* completely done
1001
*/
1002
if (!req->bio) {
1003
/*
1004
* Reset counters so that the request stacking driver
1005
* can find how many bytes remain in the request
1006
* later.
1007
*/
1008
req->__data_len = 0;
1009
return false;
1010
}
1011
1012
req->__data_len -= total_bytes;
1013
1014
/* update sector only for requests with clear definition of sector */
1015
if (!blk_rq_is_passthrough(req))
1016
req->__sector += total_bytes >> 9;
1017
1018
/* mixed attributes always follow the first bio */
1019
if (req->rq_flags & RQF_MIXED_MERGE) {
1020
req->cmd_flags &= ~REQ_FAILFAST_MASK;
1021
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
1022
}
1023
1024
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
1025
/*
1026
* If total number of sectors is less than the first segment
1027
* size, something has gone terribly wrong.
1028
*/
1029
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
1030
blk_dump_rq_flags(req, "request botched");
1031
req->__data_len = blk_rq_cur_bytes(req);
1032
}
1033
1034
/* recalculate the number of segments */
1035
req->nr_phys_segments = blk_recalc_rq_segments(req);
1036
}
1037
1038
return true;
1039
}
1040
EXPORT_SYMBOL_GPL(blk_update_request);
1041
1042
static inline void blk_account_io_done(struct request *req, u64 now)
1043
{
1044
trace_block_io_done(req);
1045
1046
/*
1047
* Account IO completion. flush_rq isn't accounted as a
1048
* normal IO on queueing nor completion. Accounting the
1049
* containing request is enough.
1050
*/
1051
if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
1052
const int sgrp = op_stat_group(req_op(req));
1053
1054
part_stat_lock();
1055
update_io_ticks(req->part, jiffies, true);
1056
part_stat_inc(req->part, ios[sgrp]);
1057
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
1058
part_stat_local_dec(req->part,
1059
in_flight[op_is_write(req_op(req))]);
1060
part_stat_unlock();
1061
}
1062
}
1063
1064
static inline bool blk_rq_passthrough_stats(struct request *req)
1065
{
1066
struct bio *bio = req->bio;
1067
1068
if (!blk_queue_passthrough_stat(req->q))
1069
return false;
1070
1071
/* Requests without a bio do not transfer data. */
1072
if (!bio)
1073
return false;
1074
1075
/*
1076
* Stats are accumulated in the bdev, so must have one attached to a
1077
* bio to track stats. Most drivers do not set the bdev for passthrough
1078
* requests, but nvme is one that will set it.
1079
*/
1080
if (!bio->bi_bdev)
1081
return false;
1082
1083
/*
1084
* We don't know what a passthrough command does, but we know the
1085
* payload size and data direction. Ensuring the size is aligned to the
1086
* block size filters out most commands with payloads that don't
1087
* represent sector access.
1088
*/
1089
if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
1090
return false;
1091
return true;
1092
}
1093
1094
static inline void blk_account_io_start(struct request *req)
1095
{
1096
trace_block_io_start(req);
1097
1098
if (!blk_queue_io_stat(req->q))
1099
return;
1100
if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
1101
return;
1102
1103
req->rq_flags |= RQF_IO_STAT;
1104
req->start_time_ns = blk_time_get_ns();
1105
1106
/*
1107
* All non-passthrough requests are created from a bio with one
1108
* exception: when a flush command that is part of a flush sequence
1109
* generated by the state machine in blk-flush.c is cloned onto the
1110
* lower device by dm-multipath we can get here without a bio.
1111
*/
1112
if (req->bio)
1113
req->part = req->bio->bi_bdev;
1114
else
1115
req->part = req->q->disk->part0;
1116
1117
part_stat_lock();
1118
update_io_ticks(req->part, jiffies, false);
1119
part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
1120
part_stat_unlock();
1121
}
1122
1123
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
1124
{
1125
if (rq->rq_flags & RQF_STATS)
1126
blk_stat_add(rq, now);
1127
1128
blk_mq_sched_completed_request(rq, now);
1129
blk_account_io_done(rq, now);
1130
}
1131
1132
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
1133
{
1134
if (blk_mq_need_time_stamp(rq))
1135
__blk_mq_end_request_acct(rq, blk_time_get_ns());
1136
1137
blk_mq_finish_request(rq);
1138
1139
if (rq->end_io) {
1140
rq_qos_done(rq->q, rq);
1141
if (rq->end_io(rq, error) == RQ_END_IO_FREE)
1142
blk_mq_free_request(rq);
1143
} else {
1144
blk_mq_free_request(rq);
1145
}
1146
}
1147
EXPORT_SYMBOL(__blk_mq_end_request);
1148
1149
void blk_mq_end_request(struct request *rq, blk_status_t error)
1150
{
1151
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
1152
BUG();
1153
__blk_mq_end_request(rq, error);
1154
}
1155
EXPORT_SYMBOL(blk_mq_end_request);
1156
1157
#define TAG_COMP_BATCH 32
1158
1159
static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
1160
int *tag_array, int nr_tags)
1161
{
1162
struct request_queue *q = hctx->queue;
1163
1164
blk_mq_sub_active_requests(hctx, nr_tags);
1165
1166
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
1167
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
1168
}
1169
1170
void blk_mq_end_request_batch(struct io_comp_batch *iob)
1171
{
1172
int tags[TAG_COMP_BATCH], nr_tags = 0;
1173
struct blk_mq_hw_ctx *cur_hctx = NULL;
1174
struct request *rq;
1175
u64 now = 0;
1176
1177
if (iob->need_ts)
1178
now = blk_time_get_ns();
1179
1180
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
1181
prefetch(rq->bio);
1182
prefetch(rq->rq_next);
1183
1184
blk_complete_request(rq);
1185
if (iob->need_ts)
1186
__blk_mq_end_request_acct(rq, now);
1187
1188
blk_mq_finish_request(rq);
1189
1190
rq_qos_done(rq->q, rq);
1191
1192
/*
1193
* If end_io handler returns NONE, then it still has
1194
* ownership of the request.
1195
*/
1196
if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
1197
continue;
1198
1199
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
1200
if (!req_ref_put_and_test(rq))
1201
continue;
1202
1203
blk_crypto_free_request(rq);
1204
blk_pm_mark_last_busy(rq);
1205
1206
if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
1207
if (cur_hctx)
1208
blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
1209
nr_tags = 0;
1210
cur_hctx = rq->mq_hctx;
1211
}
1212
tags[nr_tags++] = rq->tag;
1213
}
1214
1215
if (nr_tags)
1216
blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
1217
}
1218
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
1219
1220
static void blk_complete_reqs(struct llist_head *list)
1221
{
1222
struct llist_node *entry = llist_reverse_order(llist_del_all(list));
1223
struct request *rq, *next;
1224
1225
llist_for_each_entry_safe(rq, next, entry, ipi_list)
1226
rq->q->mq_ops->complete(rq);
1227
}
1228
1229
static __latent_entropy void blk_done_softirq(void)
1230
{
1231
blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
1232
}
1233
1234
static int blk_softirq_cpu_dead(unsigned int cpu)
1235
{
1236
blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
1237
return 0;
1238
}
1239
1240
static void __blk_mq_complete_request_remote(void *data)
1241
{
1242
__raise_softirq_irqoff(BLOCK_SOFTIRQ);
1243
}
1244
1245
static inline bool blk_mq_complete_need_ipi(struct request *rq)
1246
{
1247
int cpu = raw_smp_processor_id();
1248
1249
if (!IS_ENABLED(CONFIG_SMP) ||
1250
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
1251
return false;
1252
/*
1253
* With force threaded interrupts enabled, raising softirq from an SMP
1254
* function call will always result in waking the ksoftirqd thread.
1255
* This is probably worse than completing the request on a different
1256
* cache domain.
1257
*/
1258
if (force_irqthreads())
1259
return false;
1260
1261
/* same CPU or cache domain and capacity? Complete locally */
1262
if (cpu == rq->mq_ctx->cpu ||
1263
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
1264
cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
1265
cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
1266
return false;
1267
1268
/* don't try to IPI to an offline CPU */
1269
return cpu_online(rq->mq_ctx->cpu);
1270
}
1271
1272
static void blk_mq_complete_send_ipi(struct request *rq)
1273
{
1274
unsigned int cpu;
1275
1276
cpu = rq->mq_ctx->cpu;
1277
if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
1278
smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
1279
}
1280
1281
static void blk_mq_raise_softirq(struct request *rq)
1282
{
1283
struct llist_head *list;
1284
1285
preempt_disable();
1286
list = this_cpu_ptr(&blk_cpu_done);
1287
if (llist_add(&rq->ipi_list, list))
1288
raise_softirq(BLOCK_SOFTIRQ);
1289
preempt_enable();
1290
}
1291
1292
bool blk_mq_complete_request_remote(struct request *rq)
1293
{
1294
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
1295
1296
/*
1297
* For request which hctx has only one ctx mapping,
1298
* or a polled request, always complete locally,
1299
* it's pointless to redirect the completion.
1300
*/
1301
if ((rq->mq_hctx->nr_ctx == 1 &&
1302
rq->mq_ctx->cpu == raw_smp_processor_id()) ||
1303
rq->cmd_flags & REQ_POLLED)
1304
return false;
1305
1306
if (blk_mq_complete_need_ipi(rq)) {
1307
blk_mq_complete_send_ipi(rq);
1308
return true;
1309
}
1310
1311
if (rq->q->nr_hw_queues == 1) {
1312
blk_mq_raise_softirq(rq);
1313
return true;
1314
}
1315
return false;
1316
}
1317
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
1318
1319
/**
1320
* blk_mq_complete_request - end I/O on a request
1321
* @rq: the request being processed
1322
*
1323
* Description:
1324
* Complete a request by scheduling the ->complete_rq operation.
1325
**/
1326
void blk_mq_complete_request(struct request *rq)
1327
{
1328
if (!blk_mq_complete_request_remote(rq))
1329
rq->q->mq_ops->complete(rq);
1330
}
1331
EXPORT_SYMBOL(blk_mq_complete_request);
1332
1333
/**
1334
* blk_mq_start_request - Start processing a request
1335
* @rq: Pointer to request to be started
1336
*
1337
* Function used by device drivers to notify the block layer that a request
1338
* is going to be processed now, so blk layer can do proper initializations
1339
* such as starting the timeout timer.
1340
*/
1341
void blk_mq_start_request(struct request *rq)
1342
{
1343
struct request_queue *q = rq->q;
1344
1345
trace_block_rq_issue(rq);
1346
1347
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
1348
!blk_rq_is_passthrough(rq)) {
1349
rq->io_start_time_ns = blk_time_get_ns();
1350
rq->stats_sectors = blk_rq_sectors(rq);
1351
rq->rq_flags |= RQF_STATS;
1352
rq_qos_issue(q, rq);
1353
}
1354
1355
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
1356
1357
blk_add_timer(rq);
1358
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
1359
rq->mq_hctx->tags->rqs[rq->tag] = rq;
1360
1361
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
1362
blk_integrity_prepare(rq);
1363
1364
if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
1365
WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
1366
}
1367
EXPORT_SYMBOL(blk_mq_start_request);
1368
1369
/*
1370
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
1371
* queues. This is important for md arrays to benefit from merging
1372
* requests.
1373
*/
1374
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
1375
{
1376
if (plug->multiple_queues)
1377
return BLK_MAX_REQUEST_COUNT * 2;
1378
return BLK_MAX_REQUEST_COUNT;
1379
}
1380
1381
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1382
{
1383
struct request *last = rq_list_peek(&plug->mq_list);
1384
1385
if (!plug->rq_count) {
1386
trace_block_plug(rq->q);
1387
} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
1388
(!blk_queue_nomerges(rq->q) &&
1389
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1390
blk_mq_flush_plug_list(plug, false);
1391
last = NULL;
1392
trace_block_plug(rq->q);
1393
}
1394
1395
if (!plug->multiple_queues && last && last->q != rq->q)
1396
plug->multiple_queues = true;
1397
/*
1398
* Any request allocated from sched tags can't be issued to
1399
* ->queue_rqs() directly
1400
*/
1401
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
1402
plug->has_elevator = true;
1403
rq_list_add_tail(&plug->mq_list, rq);
1404
plug->rq_count++;
1405
}
1406
1407
/**
1408
* blk_execute_rq_nowait - insert a request to I/O scheduler for execution
1409
* @rq: request to insert
1410
* @at_head: insert request at head or tail of queue
1411
*
1412
* Description:
1413
* Insert a fully prepared request at the back of the I/O scheduler queue
1414
* for execution. Don't wait for completion.
1415
*
1416
* Note:
1417
* This function will invoke @done directly if the queue is dead.
1418
*/
1419
void blk_execute_rq_nowait(struct request *rq, bool at_head)
1420
{
1421
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1422
1423
WARN_ON(irqs_disabled());
1424
WARN_ON(!blk_rq_is_passthrough(rq));
1425
1426
blk_account_io_start(rq);
1427
1428
if (current->plug && !at_head) {
1429
blk_add_rq_to_plug(current->plug, rq);
1430
return;
1431
}
1432
1433
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1434
blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
1435
}
1436
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
1437
1438
struct blk_rq_wait {
1439
struct completion done;
1440
blk_status_t ret;
1441
};
1442
1443
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
1444
{
1445
struct blk_rq_wait *wait = rq->end_io_data;
1446
1447
wait->ret = ret;
1448
complete(&wait->done);
1449
return RQ_END_IO_NONE;
1450
}
1451
1452
bool blk_rq_is_poll(struct request *rq)
1453
{
1454
if (!rq->mq_hctx)
1455
return false;
1456
if (rq->mq_hctx->type != HCTX_TYPE_POLL)
1457
return false;
1458
return true;
1459
}
1460
EXPORT_SYMBOL_GPL(blk_rq_is_poll);
1461
1462
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
1463
{
1464
do {
1465
blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
1466
cond_resched();
1467
} while (!completion_done(wait));
1468
}
1469
1470
/**
1471
* blk_execute_rq - insert a request into queue for execution
1472
* @rq: request to insert
1473
* @at_head: insert request at head or tail of queue
1474
*
1475
* Description:
1476
* Insert a fully prepared request at the back of the I/O scheduler queue
1477
* for execution and wait for completion.
1478
* Return: The blk_status_t result provided to blk_mq_end_request().
1479
*/
1480
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
1481
{
1482
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1483
struct blk_rq_wait wait = {
1484
.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
1485
};
1486
1487
WARN_ON(irqs_disabled());
1488
WARN_ON(!blk_rq_is_passthrough(rq));
1489
1490
rq->end_io_data = &wait;
1491
rq->end_io = blk_end_sync_rq;
1492
1493
blk_account_io_start(rq);
1494
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1495
blk_mq_run_hw_queue(hctx, false);
1496
1497
if (blk_rq_is_poll(rq))
1498
blk_rq_poll_completion(rq, &wait.done);
1499
else
1500
blk_wait_io(&wait.done);
1501
1502
return wait.ret;
1503
}
1504
EXPORT_SYMBOL(blk_execute_rq);
1505
1506
static void __blk_mq_requeue_request(struct request *rq)
1507
{
1508
struct request_queue *q = rq->q;
1509
1510
blk_mq_put_driver_tag(rq);
1511
1512
trace_block_rq_requeue(rq);
1513
rq_qos_requeue(q, rq);
1514
1515
if (blk_mq_request_started(rq)) {
1516
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
1517
rq->rq_flags &= ~RQF_TIMED_OUT;
1518
}
1519
}
1520
1521
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
1522
{
1523
struct request_queue *q = rq->q;
1524
unsigned long flags;
1525
1526
__blk_mq_requeue_request(rq);
1527
1528
/* this request will be re-inserted to io scheduler queue */
1529
blk_mq_sched_requeue_request(rq);
1530
1531
spin_lock_irqsave(&q->requeue_lock, flags);
1532
list_add_tail(&rq->queuelist, &q->requeue_list);
1533
spin_unlock_irqrestore(&q->requeue_lock, flags);
1534
1535
if (kick_requeue_list)
1536
blk_mq_kick_requeue_list(q);
1537
}
1538
EXPORT_SYMBOL(blk_mq_requeue_request);
1539
1540
static void blk_mq_requeue_work(struct work_struct *work)
1541
{
1542
struct request_queue *q =
1543
container_of(work, struct request_queue, requeue_work.work);
1544
LIST_HEAD(rq_list);
1545
LIST_HEAD(flush_list);
1546
struct request *rq;
1547
1548
spin_lock_irq(&q->requeue_lock);
1549
list_splice_init(&q->requeue_list, &rq_list);
1550
list_splice_init(&q->flush_list, &flush_list);
1551
spin_unlock_irq(&q->requeue_lock);
1552
1553
while (!list_empty(&rq_list)) {
1554
rq = list_entry(rq_list.next, struct request, queuelist);
1555
list_del_init(&rq->queuelist);
1556
/*
1557
* If RQF_DONTPREP is set, the request has been started by the
1558
* driver already and might have driver-specific data allocated
1559
* already. Insert it into the hctx dispatch list to avoid
1560
* block layer merges for the request.
1561
*/
1562
if (rq->rq_flags & RQF_DONTPREP)
1563
blk_mq_request_bypass_insert(rq, 0);
1564
else
1565
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
1566
}
1567
1568
while (!list_empty(&flush_list)) {
1569
rq = list_entry(flush_list.next, struct request, queuelist);
1570
list_del_init(&rq->queuelist);
1571
blk_mq_insert_request(rq, 0);
1572
}
1573
1574
blk_mq_run_hw_queues(q, false);
1575
}
1576
1577
void blk_mq_kick_requeue_list(struct request_queue *q)
1578
{
1579
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
1580
}
1581
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
1582
1583
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
1584
unsigned long msecs)
1585
{
1586
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
1587
msecs_to_jiffies(msecs));
1588
}
1589
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
1590
1591
static bool blk_is_flush_data_rq(struct request *rq)
1592
{
1593
return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
1594
}
1595
1596
static bool blk_mq_rq_inflight(struct request *rq, void *priv)
1597
{
1598
/*
1599
* If we find a request that isn't idle we know the queue is busy
1600
* as it's checked in the iter.
1601
* Return false to stop the iteration.
1602
*
1603
* In case of queue quiesce, if one flush data request is completed,
1604
* don't count it as inflight given the flush sequence is suspended,
1605
* and the original flush data request is invisible to driver, just
1606
* like other pending requests because of quiesce
1607
*/
1608
if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
1609
blk_is_flush_data_rq(rq) &&
1610
blk_mq_request_completed(rq))) {
1611
bool *busy = priv;
1612
1613
*busy = true;
1614
return false;
1615
}
1616
1617
return true;
1618
}
1619
1620
bool blk_mq_queue_inflight(struct request_queue *q)
1621
{
1622
bool busy = false;
1623
1624
blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
1625
return busy;
1626
}
1627
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
1628
1629
static void blk_mq_rq_timed_out(struct request *req)
1630
{
1631
req->rq_flags |= RQF_TIMED_OUT;
1632
if (req->q->mq_ops->timeout) {
1633
enum blk_eh_timer_return ret;
1634
1635
ret = req->q->mq_ops->timeout(req);
1636
if (ret == BLK_EH_DONE)
1637
return;
1638
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
1639
}
1640
1641
blk_add_timer(req);
1642
}
1643
1644
struct blk_expired_data {
1645
bool has_timedout_rq;
1646
unsigned long next;
1647
unsigned long timeout_start;
1648
};
1649
1650
static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
1651
{
1652
unsigned long deadline;
1653
1654
if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
1655
return false;
1656
if (rq->rq_flags & RQF_TIMED_OUT)
1657
return false;
1658
1659
deadline = READ_ONCE(rq->deadline);
1660
if (time_after_eq(expired->timeout_start, deadline))
1661
return true;
1662
1663
if (expired->next == 0)
1664
expired->next = deadline;
1665
else if (time_after(expired->next, deadline))
1666
expired->next = deadline;
1667
return false;
1668
}
1669
1670
void blk_mq_put_rq_ref(struct request *rq)
1671
{
1672
if (is_flush_rq(rq)) {
1673
if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
1674
blk_mq_free_request(rq);
1675
} else if (req_ref_put_and_test(rq)) {
1676
__blk_mq_free_request(rq);
1677
}
1678
}
1679
1680
static bool blk_mq_check_expired(struct request *rq, void *priv)
1681
{
1682
struct blk_expired_data *expired = priv;
1683
1684
/*
1685
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
1686
* be reallocated underneath the timeout handler's processing, then
1687
* the expire check is reliable. If the request is not expired, then
1688
* it was completed and reallocated as a new request after returning
1689
* from blk_mq_check_expired().
1690
*/
1691
if (blk_mq_req_expired(rq, expired)) {
1692
expired->has_timedout_rq = true;
1693
return false;
1694
}
1695
return true;
1696
}
1697
1698
static bool blk_mq_handle_expired(struct request *rq, void *priv)
1699
{
1700
struct blk_expired_data *expired = priv;
1701
1702
if (blk_mq_req_expired(rq, expired))
1703
blk_mq_rq_timed_out(rq);
1704
return true;
1705
}
1706
1707
static void blk_mq_timeout_work(struct work_struct *work)
1708
{
1709
struct request_queue *q =
1710
container_of(work, struct request_queue, timeout_work);
1711
struct blk_expired_data expired = {
1712
.timeout_start = jiffies,
1713
};
1714
struct blk_mq_hw_ctx *hctx;
1715
unsigned long i;
1716
1717
/* A deadlock might occur if a request is stuck requiring a
1718
* timeout at the same time a queue freeze is waiting
1719
* completion, since the timeout code would not be able to
1720
* acquire the queue reference here.
1721
*
1722
* That's why we don't use blk_queue_enter here; instead, we use
1723
* percpu_ref_tryget directly, because we need to be able to
1724
* obtain a reference even in the short window between the queue
1725
* starting to freeze, by dropping the first reference in
1726
* blk_freeze_queue_start, and the moment the last request is
1727
* consumed, marked by the instant q_usage_counter reaches
1728
* zero.
1729
*/
1730
if (!percpu_ref_tryget(&q->q_usage_counter))
1731
return;
1732
1733
/* check if there is any timed-out request */
1734
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
1735
if (expired.has_timedout_rq) {
1736
/*
1737
* Before walking tags, we must ensure any submit started
1738
* before the current time has finished. Since the submit
1739
* uses srcu or rcu, wait for a synchronization point to
1740
* ensure all running submits have finished
1741
*/
1742
blk_mq_wait_quiesce_done(q->tag_set);
1743
1744
expired.next = 0;
1745
blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
1746
}
1747
1748
if (expired.next != 0) {
1749
mod_timer(&q->timeout, expired.next);
1750
} else {
1751
/*
1752
* Request timeouts are handled as a forward rolling timer. If
1753
* we end up here it means that no requests are pending and
1754
* also that no request has been pending for a while. Mark
1755
* each hctx as idle.
1756
*/
1757
queue_for_each_hw_ctx(q, hctx, i) {
1758
/* the hctx may be unmapped, so check it here */
1759
if (blk_mq_hw_queue_mapped(hctx))
1760
blk_mq_tag_idle(hctx);
1761
}
1762
}
1763
blk_queue_exit(q);
1764
}
1765
1766
struct flush_busy_ctx_data {
1767
struct blk_mq_hw_ctx *hctx;
1768
struct list_head *list;
1769
};
1770
1771
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
1772
{
1773
struct flush_busy_ctx_data *flush_data = data;
1774
struct blk_mq_hw_ctx *hctx = flush_data->hctx;
1775
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1776
enum hctx_type type = hctx->type;
1777
1778
spin_lock(&ctx->lock);
1779
list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
1780
sbitmap_clear_bit(sb, bitnr);
1781
spin_unlock(&ctx->lock);
1782
return true;
1783
}
1784
1785
/*
1786
* Process software queues that have been marked busy, splicing them
1787
* to the for-dispatch
1788
*/
1789
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1790
{
1791
struct flush_busy_ctx_data data = {
1792
.hctx = hctx,
1793
.list = list,
1794
};
1795
1796
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1797
}
1798
1799
struct dispatch_rq_data {
1800
struct blk_mq_hw_ctx *hctx;
1801
struct request *rq;
1802
};
1803
1804
static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1805
void *data)
1806
{
1807
struct dispatch_rq_data *dispatch_data = data;
1808
struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1809
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1810
enum hctx_type type = hctx->type;
1811
1812
spin_lock(&ctx->lock);
1813
if (!list_empty(&ctx->rq_lists[type])) {
1814
dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
1815
list_del_init(&dispatch_data->rq->queuelist);
1816
if (list_empty(&ctx->rq_lists[type]))
1817
sbitmap_clear_bit(sb, bitnr);
1818
}
1819
spin_unlock(&ctx->lock);
1820
1821
return !dispatch_data->rq;
1822
}
1823
1824
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1825
struct blk_mq_ctx *start)
1826
{
1827
unsigned off = start ? start->index_hw[hctx->type] : 0;
1828
struct dispatch_rq_data data = {
1829
.hctx = hctx,
1830
.rq = NULL,
1831
};
1832
1833
__sbitmap_for_each_set(&hctx->ctx_map, off,
1834
dispatch_rq_from_ctx, &data);
1835
1836
return data.rq;
1837
}
1838
1839
bool __blk_mq_alloc_driver_tag(struct request *rq)
1840
{
1841
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
1842
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1843
int tag;
1844
1845
blk_mq_tag_busy(rq->mq_hctx);
1846
1847
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1848
bt = &rq->mq_hctx->tags->breserved_tags;
1849
tag_offset = 0;
1850
} else {
1851
if (!hctx_may_queue(rq->mq_hctx, bt))
1852
return false;
1853
}
1854
1855
tag = __sbitmap_queue_get(bt);
1856
if (tag == BLK_MQ_NO_TAG)
1857
return false;
1858
1859
rq->tag = tag + tag_offset;
1860
blk_mq_inc_active_requests(rq->mq_hctx);
1861
return true;
1862
}
1863
1864
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1865
int flags, void *key)
1866
{
1867
struct blk_mq_hw_ctx *hctx;
1868
1869
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1870
1871
spin_lock(&hctx->dispatch_wait_lock);
1872
if (!list_empty(&wait->entry)) {
1873
struct sbitmap_queue *sbq;
1874
1875
list_del_init(&wait->entry);
1876
sbq = &hctx->tags->bitmap_tags;
1877
atomic_dec(&sbq->ws_active);
1878
}
1879
spin_unlock(&hctx->dispatch_wait_lock);
1880
1881
blk_mq_run_hw_queue(hctx, true);
1882
return 1;
1883
}
1884
1885
/*
1886
* Mark us waiting for a tag. For shared tags, this involves hooking us into
1887
* the tag wakeups. For non-shared tags, we can simply mark us needing a
1888
* restart. For both cases, take care to check the condition again after
1889
* marking us as waiting.
1890
*/
1891
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1892
struct request *rq)
1893
{
1894
struct sbitmap_queue *sbq;
1895
struct wait_queue_head *wq;
1896
wait_queue_entry_t *wait;
1897
bool ret;
1898
1899
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1900
!(blk_mq_is_shared_tags(hctx->flags))) {
1901
blk_mq_sched_mark_restart_hctx(hctx);
1902
1903
/*
1904
* It's possible that a tag was freed in the window between the
1905
* allocation failure and adding the hardware queue to the wait
1906
* queue.
1907
*
1908
* Don't clear RESTART here, someone else could have set it.
1909
* At most this will cost an extra queue run.
1910
*/
1911
return blk_mq_get_driver_tag(rq);
1912
}
1913
1914
wait = &hctx->dispatch_wait;
1915
if (!list_empty_careful(&wait->entry))
1916
return false;
1917
1918
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
1919
sbq = &hctx->tags->breserved_tags;
1920
else
1921
sbq = &hctx->tags->bitmap_tags;
1922
wq = &bt_wait_ptr(sbq, hctx)->wait;
1923
1924
spin_lock_irq(&wq->lock);
1925
spin_lock(&hctx->dispatch_wait_lock);
1926
if (!list_empty(&wait->entry)) {
1927
spin_unlock(&hctx->dispatch_wait_lock);
1928
spin_unlock_irq(&wq->lock);
1929
return false;
1930
}
1931
1932
atomic_inc(&sbq->ws_active);
1933
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1934
__add_wait_queue(wq, wait);
1935
1936
/*
1937
* Add one explicit barrier since blk_mq_get_driver_tag() may
1938
* not imply barrier in case of failure.
1939
*
1940
* Order adding us to wait queue and allocating driver tag.
1941
*
1942
* The pair is the one implied in sbitmap_queue_wake_up() which
1943
* orders clearing sbitmap tag bits and waitqueue_active() in
1944
* __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
1945
*
1946
* Otherwise, re-order of adding wait queue and getting driver tag
1947
* may cause __sbitmap_queue_wake_up() to wake up nothing because
1948
* the waitqueue_active() may not observe us in wait queue.
1949
*/
1950
smp_mb();
1951
1952
/*
1953
* It's possible that a tag was freed in the window between the
1954
* allocation failure and adding the hardware queue to the wait
1955
* queue.
1956
*/
1957
ret = blk_mq_get_driver_tag(rq);
1958
if (!ret) {
1959
spin_unlock(&hctx->dispatch_wait_lock);
1960
spin_unlock_irq(&wq->lock);
1961
return false;
1962
}
1963
1964
/*
1965
* We got a tag, remove ourselves from the wait queue to ensure
1966
* someone else gets the wakeup.
1967
*/
1968
list_del_init(&wait->entry);
1969
atomic_dec(&sbq->ws_active);
1970
spin_unlock(&hctx->dispatch_wait_lock);
1971
spin_unlock_irq(&wq->lock);
1972
1973
return true;
1974
}
1975
1976
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
1977
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
1978
/*
1979
* Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1980
* - EWMA is one simple way to compute running average value
1981
* - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1982
* - take 4 as factor for avoiding to get too small(0) result, and this
1983
* factor doesn't matter because EWMA decreases exponentially
1984
*/
1985
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1986
{
1987
unsigned int ewma;
1988
1989
ewma = hctx->dispatch_busy;
1990
1991
if (!ewma && !busy)
1992
return;
1993
1994
ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1995
if (busy)
1996
ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1997
ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1998
1999
hctx->dispatch_busy = ewma;
2000
}
2001
2002
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
2003
2004
static void blk_mq_handle_dev_resource(struct request *rq,
2005
struct list_head *list)
2006
{
2007
list_add(&rq->queuelist, list);
2008
__blk_mq_requeue_request(rq);
2009
}
2010
2011
enum prep_dispatch {
2012
PREP_DISPATCH_OK,
2013
PREP_DISPATCH_NO_TAG,
2014
PREP_DISPATCH_NO_BUDGET,
2015
};
2016
2017
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
2018
bool need_budget)
2019
{
2020
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2021
int budget_token = -1;
2022
2023
if (need_budget) {
2024
budget_token = blk_mq_get_dispatch_budget(rq->q);
2025
if (budget_token < 0) {
2026
blk_mq_put_driver_tag(rq);
2027
return PREP_DISPATCH_NO_BUDGET;
2028
}
2029
blk_mq_set_rq_budget_token(rq, budget_token);
2030
}
2031
2032
if (!blk_mq_get_driver_tag(rq)) {
2033
/*
2034
* The initial allocation attempt failed, so we need to
2035
* rerun the hardware queue when a tag is freed. The
2036
* waitqueue takes care of that. If the queue is run
2037
* before we add this entry back on the dispatch list,
2038
* we'll re-run it below.
2039
*/
2040
if (!blk_mq_mark_tag_wait(hctx, rq)) {
2041
/*
2042
* All budgets not got from this function will be put
2043
* together during handling partial dispatch
2044
*/
2045
if (need_budget)
2046
blk_mq_put_dispatch_budget(rq->q, budget_token);
2047
return PREP_DISPATCH_NO_TAG;
2048
}
2049
}
2050
2051
return PREP_DISPATCH_OK;
2052
}
2053
2054
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
2055
static void blk_mq_release_budgets(struct request_queue *q,
2056
struct list_head *list)
2057
{
2058
struct request *rq;
2059
2060
list_for_each_entry(rq, list, queuelist) {
2061
int budget_token = blk_mq_get_rq_budget_token(rq);
2062
2063
if (budget_token >= 0)
2064
blk_mq_put_dispatch_budget(q, budget_token);
2065
}
2066
}
2067
2068
/*
2069
* blk_mq_commit_rqs will notify driver using bd->last that there is no
2070
* more requests. (See comment in struct blk_mq_ops for commit_rqs for
2071
* details)
2072
* Attention, we should explicitly call this in unusual cases:
2073
* 1) did not queue everything initially scheduled to queue
2074
* 2) the last attempt to queue a request failed
2075
*/
2076
static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
2077
bool from_schedule)
2078
{
2079
if (hctx->queue->mq_ops->commit_rqs && queued) {
2080
trace_block_unplug(hctx->queue, queued, !from_schedule);
2081
hctx->queue->mq_ops->commit_rqs(hctx);
2082
}
2083
}
2084
2085
/*
2086
* Returns true if we did some work AND can potentially do more.
2087
*/
2088
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
2089
bool get_budget)
2090
{
2091
enum prep_dispatch prep;
2092
struct request_queue *q = hctx->queue;
2093
struct request *rq;
2094
int queued;
2095
blk_status_t ret = BLK_STS_OK;
2096
bool needs_resource = false;
2097
2098
if (list_empty(list))
2099
return false;
2100
2101
/*
2102
* Now process all the entries, sending them to the driver.
2103
*/
2104
queued = 0;
2105
do {
2106
struct blk_mq_queue_data bd;
2107
2108
rq = list_first_entry(list, struct request, queuelist);
2109
2110
WARN_ON_ONCE(hctx != rq->mq_hctx);
2111
prep = blk_mq_prep_dispatch_rq(rq, get_budget);
2112
if (prep != PREP_DISPATCH_OK)
2113
break;
2114
2115
list_del_init(&rq->queuelist);
2116
2117
bd.rq = rq;
2118
bd.last = list_empty(list);
2119
2120
ret = q->mq_ops->queue_rq(hctx, &bd);
2121
switch (ret) {
2122
case BLK_STS_OK:
2123
queued++;
2124
break;
2125
case BLK_STS_RESOURCE:
2126
needs_resource = true;
2127
fallthrough;
2128
case BLK_STS_DEV_RESOURCE:
2129
blk_mq_handle_dev_resource(rq, list);
2130
goto out;
2131
default:
2132
blk_mq_end_request(rq, ret);
2133
}
2134
} while (!list_empty(list));
2135
out:
2136
/* If we didn't flush the entire list, we could have told the driver
2137
* there was more coming, but that turned out to be a lie.
2138
*/
2139
if (!list_empty(list) || ret != BLK_STS_OK)
2140
blk_mq_commit_rqs(hctx, queued, false);
2141
2142
/*
2143
* Any items that need requeuing? Stuff them into hctx->dispatch,
2144
* that is where we will continue on next queue run.
2145
*/
2146
if (!list_empty(list)) {
2147
bool needs_restart;
2148
/* For non-shared tags, the RESTART check will suffice */
2149
bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
2150
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
2151
blk_mq_is_shared_tags(hctx->flags));
2152
2153
/*
2154
* If the caller allocated budgets, free the budgets of the
2155
* requests that have not yet been passed to the block driver.
2156
*/
2157
if (!get_budget)
2158
blk_mq_release_budgets(q, list);
2159
2160
spin_lock(&hctx->lock);
2161
list_splice_tail_init(list, &hctx->dispatch);
2162
spin_unlock(&hctx->lock);
2163
2164
/*
2165
* Order adding requests to hctx->dispatch and checking
2166
* SCHED_RESTART flag. The pair of this smp_mb() is the one
2167
* in blk_mq_sched_restart(). Avoid restart code path to
2168
* miss the new added requests to hctx->dispatch, meantime
2169
* SCHED_RESTART is observed here.
2170
*/
2171
smp_mb();
2172
2173
/*
2174
* If SCHED_RESTART was set by the caller of this function and
2175
* it is no longer set that means that it was cleared by another
2176
* thread and hence that a queue rerun is needed.
2177
*
2178
* If 'no_tag' is set, that means that we failed getting
2179
* a driver tag with an I/O scheduler attached. If our dispatch
2180
* waitqueue is no longer active, ensure that we run the queue
2181
* AFTER adding our entries back to the list.
2182
*
2183
* If no I/O scheduler has been configured it is possible that
2184
* the hardware queue got stopped and restarted before requests
2185
* were pushed back onto the dispatch list. Rerun the queue to
2186
* avoid starvation. Notes:
2187
* - blk_mq_run_hw_queue() checks whether or not a queue has
2188
* been stopped before rerunning a queue.
2189
* - Some but not all block drivers stop a queue before
2190
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
2191
* and dm-rq.
2192
*
2193
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
2194
* bit is set, run queue after a delay to avoid IO stalls
2195
* that could otherwise occur if the queue is idle. We'll do
2196
* similar if we couldn't get budget or couldn't lock a zone
2197
* and SCHED_RESTART is set.
2198
*/
2199
needs_restart = blk_mq_sched_needs_restart(hctx);
2200
if (prep == PREP_DISPATCH_NO_BUDGET)
2201
needs_resource = true;
2202
if (!needs_restart ||
2203
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
2204
blk_mq_run_hw_queue(hctx, true);
2205
else if (needs_resource)
2206
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
2207
2208
blk_mq_update_dispatch_busy(hctx, true);
2209
return false;
2210
}
2211
2212
blk_mq_update_dispatch_busy(hctx, false);
2213
return true;
2214
}
2215
2216
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
2217
{
2218
int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
2219
2220
if (cpu >= nr_cpu_ids)
2221
cpu = cpumask_first(hctx->cpumask);
2222
return cpu;
2223
}
2224
2225
/*
2226
* ->next_cpu is always calculated from hctx->cpumask, so simply use
2227
* it for speeding up the check
2228
*/
2229
static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
2230
{
2231
return hctx->next_cpu >= nr_cpu_ids;
2232
}
2233
2234
/*
2235
* It'd be great if the workqueue API had a way to pass
2236
* in a mask and had some smarts for more clever placement.
2237
* For now we just round-robin here, switching for every
2238
* BLK_MQ_CPU_WORK_BATCH queued items.
2239
*/
2240
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
2241
{
2242
bool tried = false;
2243
int next_cpu = hctx->next_cpu;
2244
2245
/* Switch to unbound if no allowable CPUs in this hctx */
2246
if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
2247
return WORK_CPU_UNBOUND;
2248
2249
if (--hctx->next_cpu_batch <= 0) {
2250
select_cpu:
2251
next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
2252
cpu_online_mask);
2253
if (next_cpu >= nr_cpu_ids)
2254
next_cpu = blk_mq_first_mapped_cpu(hctx);
2255
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2256
}
2257
2258
/*
2259
* Do unbound schedule if we can't find a online CPU for this hctx,
2260
* and it should only happen in the path of handling CPU DEAD.
2261
*/
2262
if (!cpu_online(next_cpu)) {
2263
if (!tried) {
2264
tried = true;
2265
goto select_cpu;
2266
}
2267
2268
/*
2269
* Make sure to re-select CPU next time once after CPUs
2270
* in hctx->cpumask become online again.
2271
*/
2272
hctx->next_cpu = next_cpu;
2273
hctx->next_cpu_batch = 1;
2274
return WORK_CPU_UNBOUND;
2275
}
2276
2277
hctx->next_cpu = next_cpu;
2278
return next_cpu;
2279
}
2280
2281
/**
2282
* blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
2283
* @hctx: Pointer to the hardware queue to run.
2284
* @msecs: Milliseconds of delay to wait before running the queue.
2285
*
2286
* Run a hardware queue asynchronously with a delay of @msecs.
2287
*/
2288
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
2289
{
2290
if (unlikely(blk_mq_hctx_stopped(hctx)))
2291
return;
2292
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
2293
msecs_to_jiffies(msecs));
2294
}
2295
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
2296
2297
static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
2298
{
2299
bool need_run;
2300
2301
/*
2302
* When queue is quiesced, we may be switching io scheduler, or
2303
* updating nr_hw_queues, or other things, and we can't run queue
2304
* any more, even blk_mq_hctx_has_pending() can't be called safely.
2305
*
2306
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
2307
* quiesced.
2308
*/
2309
__blk_mq_run_dispatch_ops(hctx->queue, false,
2310
need_run = !blk_queue_quiesced(hctx->queue) &&
2311
blk_mq_hctx_has_pending(hctx));
2312
return need_run;
2313
}
2314
2315
/**
2316
* blk_mq_run_hw_queue - Start to run a hardware queue.
2317
* @hctx: Pointer to the hardware queue to run.
2318
* @async: If we want to run the queue asynchronously.
2319
*
2320
* Check if the request queue is not in a quiesced state and if there are
2321
* pending requests to be sent. If this is true, run the queue to send requests
2322
* to hardware.
2323
*/
2324
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2325
{
2326
bool need_run;
2327
2328
/*
2329
* We can't run the queue inline with interrupts disabled.
2330
*/
2331
WARN_ON_ONCE(!async && in_interrupt());
2332
2333
might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
2334
2335
need_run = blk_mq_hw_queue_need_run(hctx);
2336
if (!need_run) {
2337
unsigned long flags;
2338
2339
/*
2340
* Synchronize with blk_mq_unquiesce_queue(), because we check
2341
* if hw queue is quiesced locklessly above, we need the use
2342
* ->queue_lock to make sure we see the up-to-date status to
2343
* not miss rerunning the hw queue.
2344
*/
2345
spin_lock_irqsave(&hctx->queue->queue_lock, flags);
2346
need_run = blk_mq_hw_queue_need_run(hctx);
2347
spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
2348
2349
if (!need_run)
2350
return;
2351
}
2352
2353
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
2354
blk_mq_delay_run_hw_queue(hctx, 0);
2355
return;
2356
}
2357
2358
blk_mq_run_dispatch_ops(hctx->queue,
2359
blk_mq_sched_dispatch_requests(hctx));
2360
}
2361
EXPORT_SYMBOL(blk_mq_run_hw_queue);
2362
2363
/*
2364
* Return prefered queue to dispatch from (if any) for non-mq aware IO
2365
* scheduler.
2366
*/
2367
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
2368
{
2369
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
2370
/*
2371
* If the IO scheduler does not respect hardware queues when
2372
* dispatching, we just don't bother with multiple HW queues and
2373
* dispatch from hctx for the current CPU since running multiple queues
2374
* just causes lock contention inside the scheduler and pointless cache
2375
* bouncing.
2376
*/
2377
struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
2378
2379
if (!blk_mq_hctx_stopped(hctx))
2380
return hctx;
2381
return NULL;
2382
}
2383
2384
/**
2385
* blk_mq_run_hw_queues - Run all hardware queues in a request queue.
2386
* @q: Pointer to the request queue to run.
2387
* @async: If we want to run the queue asynchronously.
2388
*/
2389
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
2390
{
2391
struct blk_mq_hw_ctx *hctx, *sq_hctx;
2392
unsigned long i;
2393
2394
sq_hctx = NULL;
2395
if (blk_queue_sq_sched(q))
2396
sq_hctx = blk_mq_get_sq_hctx(q);
2397
queue_for_each_hw_ctx(q, hctx, i) {
2398
if (blk_mq_hctx_stopped(hctx))
2399
continue;
2400
/*
2401
* Dispatch from this hctx either if there's no hctx preferred
2402
* by IO scheduler or if it has requests that bypass the
2403
* scheduler.
2404
*/
2405
if (!sq_hctx || sq_hctx == hctx ||
2406
!list_empty_careful(&hctx->dispatch))
2407
blk_mq_run_hw_queue(hctx, async);
2408
}
2409
}
2410
EXPORT_SYMBOL(blk_mq_run_hw_queues);
2411
2412
/**
2413
* blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
2414
* @q: Pointer to the request queue to run.
2415
* @msecs: Milliseconds of delay to wait before running the queues.
2416
*/
2417
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
2418
{
2419
struct blk_mq_hw_ctx *hctx, *sq_hctx;
2420
unsigned long i;
2421
2422
sq_hctx = NULL;
2423
if (blk_queue_sq_sched(q))
2424
sq_hctx = blk_mq_get_sq_hctx(q);
2425
queue_for_each_hw_ctx(q, hctx, i) {
2426
if (blk_mq_hctx_stopped(hctx))
2427
continue;
2428
/*
2429
* If there is already a run_work pending, leave the
2430
* pending delay untouched. Otherwise, a hctx can stall
2431
* if another hctx is re-delaying the other's work
2432
* before the work executes.
2433
*/
2434
if (delayed_work_pending(&hctx->run_work))
2435
continue;
2436
/*
2437
* Dispatch from this hctx either if there's no hctx preferred
2438
* by IO scheduler or if it has requests that bypass the
2439
* scheduler.
2440
*/
2441
if (!sq_hctx || sq_hctx == hctx ||
2442
!list_empty_careful(&hctx->dispatch))
2443
blk_mq_delay_run_hw_queue(hctx, msecs);
2444
}
2445
}
2446
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
2447
2448
/*
2449
* This function is often used for pausing .queue_rq() by driver when
2450
* there isn't enough resource or some conditions aren't satisfied, and
2451
* BLK_STS_RESOURCE is usually returned.
2452
*
2453
* We do not guarantee that dispatch can be drained or blocked
2454
* after blk_mq_stop_hw_queue() returns. Please use
2455
* blk_mq_quiesce_queue() for that requirement.
2456
*/
2457
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
2458
{
2459
cancel_delayed_work(&hctx->run_work);
2460
2461
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
2462
}
2463
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
2464
2465
/*
2466
* This function is often used for pausing .queue_rq() by driver when
2467
* there isn't enough resource or some conditions aren't satisfied, and
2468
* BLK_STS_RESOURCE is usually returned.
2469
*
2470
* We do not guarantee that dispatch can be drained or blocked
2471
* after blk_mq_stop_hw_queues() returns. Please use
2472
* blk_mq_quiesce_queue() for that requirement.
2473
*/
2474
void blk_mq_stop_hw_queues(struct request_queue *q)
2475
{
2476
struct blk_mq_hw_ctx *hctx;
2477
unsigned long i;
2478
2479
queue_for_each_hw_ctx(q, hctx, i)
2480
blk_mq_stop_hw_queue(hctx);
2481
}
2482
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
2483
2484
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
2485
{
2486
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2487
2488
blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
2489
}
2490
EXPORT_SYMBOL(blk_mq_start_hw_queue);
2491
2492
void blk_mq_start_hw_queues(struct request_queue *q)
2493
{
2494
struct blk_mq_hw_ctx *hctx;
2495
unsigned long i;
2496
2497
queue_for_each_hw_ctx(q, hctx, i)
2498
blk_mq_start_hw_queue(hctx);
2499
}
2500
EXPORT_SYMBOL(blk_mq_start_hw_queues);
2501
2502
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2503
{
2504
if (!blk_mq_hctx_stopped(hctx))
2505
return;
2506
2507
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2508
/*
2509
* Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
2510
* clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
2511
* list in the subsequent routine.
2512
*/
2513
smp_mb__after_atomic();
2514
blk_mq_run_hw_queue(hctx, async);
2515
}
2516
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
2517
2518
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
2519
{
2520
struct blk_mq_hw_ctx *hctx;
2521
unsigned long i;
2522
2523
queue_for_each_hw_ctx(q, hctx, i)
2524
blk_mq_start_stopped_hw_queue(hctx, async ||
2525
(hctx->flags & BLK_MQ_F_BLOCKING));
2526
}
2527
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
2528
2529
static void blk_mq_run_work_fn(struct work_struct *work)
2530
{
2531
struct blk_mq_hw_ctx *hctx =
2532
container_of(work, struct blk_mq_hw_ctx, run_work.work);
2533
2534
blk_mq_run_dispatch_ops(hctx->queue,
2535
blk_mq_sched_dispatch_requests(hctx));
2536
}
2537
2538
/**
2539
* blk_mq_request_bypass_insert - Insert a request at dispatch list.
2540
* @rq: Pointer to request to be inserted.
2541
* @flags: BLK_MQ_INSERT_*
2542
*
2543
* Should only be used carefully, when the caller knows we want to
2544
* bypass a potential IO scheduler on the target device.
2545
*/
2546
static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
2547
{
2548
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2549
2550
spin_lock(&hctx->lock);
2551
if (flags & BLK_MQ_INSERT_AT_HEAD)
2552
list_add(&rq->queuelist, &hctx->dispatch);
2553
else
2554
list_add_tail(&rq->queuelist, &hctx->dispatch);
2555
spin_unlock(&hctx->lock);
2556
}
2557
2558
static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
2559
struct blk_mq_ctx *ctx, struct list_head *list,
2560
bool run_queue_async)
2561
{
2562
struct request *rq;
2563
enum hctx_type type = hctx->type;
2564
2565
/*
2566
* Try to issue requests directly if the hw queue isn't busy to save an
2567
* extra enqueue & dequeue to the sw queue.
2568
*/
2569
if (!hctx->dispatch_busy && !run_queue_async) {
2570
blk_mq_run_dispatch_ops(hctx->queue,
2571
blk_mq_try_issue_list_directly(hctx, list));
2572
if (list_empty(list))
2573
goto out;
2574
}
2575
2576
/*
2577
* preemption doesn't flush plug list, so it's possible ctx->cpu is
2578
* offline now
2579
*/
2580
list_for_each_entry(rq, list, queuelist) {
2581
BUG_ON(rq->mq_ctx != ctx);
2582
trace_block_rq_insert(rq);
2583
if (rq->cmd_flags & REQ_NOWAIT)
2584
run_queue_async = true;
2585
}
2586
2587
spin_lock(&ctx->lock);
2588
list_splice_tail_init(list, &ctx->rq_lists[type]);
2589
blk_mq_hctx_mark_pending(hctx, ctx);
2590
spin_unlock(&ctx->lock);
2591
out:
2592
blk_mq_run_hw_queue(hctx, run_queue_async);
2593
}
2594
2595
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
2596
{
2597
struct request_queue *q = rq->q;
2598
struct blk_mq_ctx *ctx = rq->mq_ctx;
2599
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2600
2601
if (blk_rq_is_passthrough(rq)) {
2602
/*
2603
* Passthrough request have to be added to hctx->dispatch
2604
* directly. The device may be in a situation where it can't
2605
* handle FS request, and always returns BLK_STS_RESOURCE for
2606
* them, which gets them added to hctx->dispatch.
2607
*
2608
* If a passthrough request is required to unblock the queues,
2609
* and it is added to the scheduler queue, there is no chance to
2610
* dispatch it given we prioritize requests in hctx->dispatch.
2611
*/
2612
blk_mq_request_bypass_insert(rq, flags);
2613
} else if (req_op(rq) == REQ_OP_FLUSH) {
2614
/*
2615
* Firstly normal IO request is inserted to scheduler queue or
2616
* sw queue, meantime we add flush request to dispatch queue(
2617
* hctx->dispatch) directly and there is at most one in-flight
2618
* flush request for each hw queue, so it doesn't matter to add
2619
* flush request to tail or front of the dispatch queue.
2620
*
2621
* Secondly in case of NCQ, flush request belongs to non-NCQ
2622
* command, and queueing it will fail when there is any
2623
* in-flight normal IO request(NCQ command). When adding flush
2624
* rq to the front of hctx->dispatch, it is easier to introduce
2625
* extra time to flush rq's latency because of S_SCHED_RESTART
2626
* compared with adding to the tail of dispatch queue, then
2627
* chance of flush merge is increased, and less flush requests
2628
* will be issued to controller. It is observed that ~10% time
2629
* is saved in blktests block/004 on disk attached to AHCI/NCQ
2630
* drive when adding flush rq to the front of hctx->dispatch.
2631
*
2632
* Simply queue flush rq to the front of hctx->dispatch so that
2633
* intensive flush workloads can benefit in case of NCQ HW.
2634
*/
2635
blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
2636
} else if (q->elevator) {
2637
LIST_HEAD(list);
2638
2639
WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);
2640
2641
list_add(&rq->queuelist, &list);
2642
q->elevator->type->ops.insert_requests(hctx, &list, flags);
2643
} else {
2644
trace_block_rq_insert(rq);
2645
2646
spin_lock(&ctx->lock);
2647
if (flags & BLK_MQ_INSERT_AT_HEAD)
2648
list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
2649
else
2650
list_add_tail(&rq->queuelist,
2651
&ctx->rq_lists[hctx->type]);
2652
blk_mq_hctx_mark_pending(hctx, ctx);
2653
spin_unlock(&ctx->lock);
2654
}
2655
}
2656
2657
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
2658
unsigned int nr_segs)
2659
{
2660
int err;
2661
2662
if (bio->bi_opf & REQ_RAHEAD)
2663
rq->cmd_flags |= REQ_FAILFAST_MASK;
2664
2665
rq->bio = rq->biotail = bio;
2666
rq->__sector = bio->bi_iter.bi_sector;
2667
rq->__data_len = bio->bi_iter.bi_size;
2668
rq->nr_phys_segments = nr_segs;
2669
if (bio_integrity(bio))
2670
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
2671
bio);
2672
2673
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2674
err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
2675
WARN_ON_ONCE(err);
2676
2677
blk_account_io_start(rq);
2678
}
2679
2680
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
2681
struct request *rq, bool last)
2682
{
2683
struct request_queue *q = rq->q;
2684
struct blk_mq_queue_data bd = {
2685
.rq = rq,
2686
.last = last,
2687
};
2688
blk_status_t ret;
2689
2690
/*
2691
* For OK queue, we are done. For error, caller may kill it.
2692
* Any other error (busy), just add it to our list as we
2693
* previously would have done.
2694
*/
2695
ret = q->mq_ops->queue_rq(hctx, &bd);
2696
switch (ret) {
2697
case BLK_STS_OK:
2698
blk_mq_update_dispatch_busy(hctx, false);
2699
break;
2700
case BLK_STS_RESOURCE:
2701
case BLK_STS_DEV_RESOURCE:
2702
blk_mq_update_dispatch_busy(hctx, true);
2703
__blk_mq_requeue_request(rq);
2704
break;
2705
default:
2706
blk_mq_update_dispatch_busy(hctx, false);
2707
break;
2708
}
2709
2710
return ret;
2711
}
2712
2713
static bool blk_mq_get_budget_and_tag(struct request *rq)
2714
{
2715
int budget_token;
2716
2717
budget_token = blk_mq_get_dispatch_budget(rq->q);
2718
if (budget_token < 0)
2719
return false;
2720
blk_mq_set_rq_budget_token(rq, budget_token);
2721
if (!blk_mq_get_driver_tag(rq)) {
2722
blk_mq_put_dispatch_budget(rq->q, budget_token);
2723
return false;
2724
}
2725
return true;
2726
}
2727
2728
/**
2729
* blk_mq_try_issue_directly - Try to send a request directly to device driver.
2730
* @hctx: Pointer of the associated hardware queue.
2731
* @rq: Pointer to request to be sent.
2732
*
2733
* If the device has enough resources to accept a new request now, send the
2734
* request directly to device driver. Else, insert at hctx->dispatch queue, so
2735
* we can try send it another time in the future. Requests inserted at this
2736
* queue have higher priority.
2737
*/
2738
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2739
struct request *rq)
2740
{
2741
blk_status_t ret;
2742
2743
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2744
blk_mq_insert_request(rq, 0);
2745
blk_mq_run_hw_queue(hctx, false);
2746
return;
2747
}
2748
2749
if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
2750
blk_mq_insert_request(rq, 0);
2751
blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
2752
return;
2753
}
2754
2755
ret = __blk_mq_issue_directly(hctx, rq, true);
2756
switch (ret) {
2757
case BLK_STS_OK:
2758
break;
2759
case BLK_STS_RESOURCE:
2760
case BLK_STS_DEV_RESOURCE:
2761
blk_mq_request_bypass_insert(rq, 0);
2762
blk_mq_run_hw_queue(hctx, false);
2763
break;
2764
default:
2765
blk_mq_end_request(rq, ret);
2766
break;
2767
}
2768
}
2769
2770
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2771
{
2772
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2773
2774
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2775
blk_mq_insert_request(rq, 0);
2776
blk_mq_run_hw_queue(hctx, false);
2777
return BLK_STS_OK;
2778
}
2779
2780
if (!blk_mq_get_budget_and_tag(rq))
2781
return BLK_STS_RESOURCE;
2782
return __blk_mq_issue_directly(hctx, rq, last);
2783
}
2784
2785
static void blk_mq_issue_direct(struct rq_list *rqs)
2786
{
2787
struct blk_mq_hw_ctx *hctx = NULL;
2788
struct request *rq;
2789
int queued = 0;
2790
blk_status_t ret = BLK_STS_OK;
2791
2792
while ((rq = rq_list_pop(rqs))) {
2793
bool last = rq_list_empty(rqs);
2794
2795
if (hctx != rq->mq_hctx) {
2796
if (hctx) {
2797
blk_mq_commit_rqs(hctx, queued, false);
2798
queued = 0;
2799
}
2800
hctx = rq->mq_hctx;
2801
}
2802
2803
ret = blk_mq_request_issue_directly(rq, last);
2804
switch (ret) {
2805
case BLK_STS_OK:
2806
queued++;
2807
break;
2808
case BLK_STS_RESOURCE:
2809
case BLK_STS_DEV_RESOURCE:
2810
blk_mq_request_bypass_insert(rq, 0);
2811
blk_mq_run_hw_queue(hctx, false);
2812
goto out;
2813
default:
2814
blk_mq_end_request(rq, ret);
2815
break;
2816
}
2817
}
2818
2819
out:
2820
if (ret != BLK_STS_OK)
2821
blk_mq_commit_rqs(hctx, queued, false);
2822
}
2823
2824
static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
2825
{
2826
if (blk_queue_quiesced(q))
2827
return;
2828
q->mq_ops->queue_rqs(rqs);
2829
}
2830
2831
static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
2832
struct rq_list *queue_rqs)
2833
{
2834
struct request *rq = rq_list_pop(rqs);
2835
struct request_queue *this_q = rq->q;
2836
struct request **prev = &rqs->head;
2837
struct rq_list matched_rqs = {};
2838
struct request *last = NULL;
2839
unsigned depth = 1;
2840
2841
rq_list_add_tail(&matched_rqs, rq);
2842
while ((rq = *prev)) {
2843
if (rq->q == this_q) {
2844
/* move rq from rqs to matched_rqs */
2845
*prev = rq->rq_next;
2846
rq_list_add_tail(&matched_rqs, rq);
2847
depth++;
2848
} else {
2849
/* leave rq in rqs */
2850
prev = &rq->rq_next;
2851
last = rq;
2852
}
2853
}
2854
2855
rqs->tail = last;
2856
*queue_rqs = matched_rqs;
2857
return depth;
2858
}
2859
2860
static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
2861
{
2862
struct request_queue *q = rq_list_peek(rqs)->q;
2863
2864
trace_block_unplug(q, depth, true);
2865
2866
/*
2867
* Peek first request and see if we have a ->queue_rqs() hook.
2868
* If we do, we can dispatch the whole list in one go.
2869
* We already know at this point that all requests belong to the
2870
* same queue, caller must ensure that's the case.
2871
*/
2872
if (q->mq_ops->queue_rqs) {
2873
blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
2874
if (rq_list_empty(rqs))
2875
return;
2876
}
2877
2878
blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
2879
}
2880
2881
static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
2882
{
2883
struct blk_mq_hw_ctx *this_hctx = NULL;
2884
struct blk_mq_ctx *this_ctx = NULL;
2885
struct rq_list requeue_list = {};
2886
unsigned int depth = 0;
2887
bool is_passthrough = false;
2888
LIST_HEAD(list);
2889
2890
do {
2891
struct request *rq = rq_list_pop(rqs);
2892
2893
if (!this_hctx) {
2894
this_hctx = rq->mq_hctx;
2895
this_ctx = rq->mq_ctx;
2896
is_passthrough = blk_rq_is_passthrough(rq);
2897
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
2898
is_passthrough != blk_rq_is_passthrough(rq)) {
2899
rq_list_add_tail(&requeue_list, rq);
2900
continue;
2901
}
2902
list_add_tail(&rq->queuelist, &list);
2903
depth++;
2904
} while (!rq_list_empty(rqs));
2905
2906
*rqs = requeue_list;
2907
trace_block_unplug(this_hctx->queue, depth, !from_sched);
2908
2909
percpu_ref_get(&this_hctx->queue->q_usage_counter);
2910
/* passthrough requests should never be issued to the I/O scheduler */
2911
if (is_passthrough) {
2912
spin_lock(&this_hctx->lock);
2913
list_splice_tail_init(&list, &this_hctx->dispatch);
2914
spin_unlock(&this_hctx->lock);
2915
blk_mq_run_hw_queue(this_hctx, from_sched);
2916
} else if (this_hctx->queue->elevator) {
2917
this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
2918
&list, 0);
2919
blk_mq_run_hw_queue(this_hctx, from_sched);
2920
} else {
2921
blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
2922
}
2923
percpu_ref_put(&this_hctx->queue->q_usage_counter);
2924
}
2925
2926
static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
2927
{
2928
do {
2929
struct rq_list queue_rqs;
2930
unsigned depth;
2931
2932
depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
2933
blk_mq_dispatch_queue_requests(&queue_rqs, depth);
2934
while (!rq_list_empty(&queue_rqs))
2935
blk_mq_dispatch_list(&queue_rqs, false);
2936
} while (!rq_list_empty(rqs));
2937
}
2938
2939
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2940
{
2941
unsigned int depth;
2942
2943
/*
2944
* We may have been called recursively midway through handling
2945
* plug->mq_list via a schedule() in the driver's queue_rq() callback.
2946
* To avoid mq_list changing under our feet, clear rq_count early and
2947
* bail out specifically if rq_count is 0 rather than checking
2948
* whether the mq_list is empty.
2949
*/
2950
if (plug->rq_count == 0)
2951
return;
2952
depth = plug->rq_count;
2953
plug->rq_count = 0;
2954
2955
if (!plug->has_elevator && !from_schedule) {
2956
if (plug->multiple_queues) {
2957
blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
2958
return;
2959
}
2960
2961
blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
2962
if (rq_list_empty(&plug->mq_list))
2963
return;
2964
}
2965
2966
do {
2967
blk_mq_dispatch_list(&plug->mq_list, from_schedule);
2968
} while (!rq_list_empty(&plug->mq_list));
2969
}
2970
2971
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
2972
struct list_head *list)
2973
{
2974
int queued = 0;
2975
blk_status_t ret = BLK_STS_OK;
2976
2977
while (!list_empty(list)) {
2978
struct request *rq = list_first_entry(list, struct request,
2979
queuelist);
2980
2981
list_del_init(&rq->queuelist);
2982
ret = blk_mq_request_issue_directly(rq, list_empty(list));
2983
switch (ret) {
2984
case BLK_STS_OK:
2985
queued++;
2986
break;
2987
case BLK_STS_RESOURCE:
2988
case BLK_STS_DEV_RESOURCE:
2989
blk_mq_request_bypass_insert(rq, 0);
2990
if (list_empty(list))
2991
blk_mq_run_hw_queue(hctx, false);
2992
goto out;
2993
default:
2994
blk_mq_end_request(rq, ret);
2995
break;
2996
}
2997
}
2998
2999
out:
3000
if (ret != BLK_STS_OK)
3001
blk_mq_commit_rqs(hctx, queued, false);
3002
}
3003
3004
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
3005
struct bio *bio, unsigned int nr_segs)
3006
{
3007
if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
3008
if (blk_attempt_plug_merge(q, bio, nr_segs))
3009
return true;
3010
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
3011
return true;
3012
}
3013
return false;
3014
}
3015
3016
static struct request *blk_mq_get_new_requests(struct request_queue *q,
3017
struct blk_plug *plug,
3018
struct bio *bio)
3019
{
3020
struct blk_mq_alloc_data data = {
3021
.q = q,
3022
.flags = 0,
3023
.shallow_depth = 0,
3024
.cmd_flags = bio->bi_opf,
3025
.rq_flags = 0,
3026
.nr_tags = 1,
3027
.cached_rqs = NULL,
3028
.ctx = NULL,
3029
.hctx = NULL
3030
};
3031
struct request *rq;
3032
3033
rq_qos_throttle(q, bio);
3034
3035
if (plug) {
3036
data.nr_tags = plug->nr_ios;
3037
plug->nr_ios = 1;
3038
data.cached_rqs = &plug->cached_rqs;
3039
}
3040
3041
rq = __blk_mq_alloc_requests(&data);
3042
if (unlikely(!rq))
3043
rq_qos_cleanup(q, bio);
3044
return rq;
3045
}
3046
3047
/*
3048
* Check if there is a suitable cached request and return it.
3049
*/
3050
static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
3051
struct request_queue *q, blk_opf_t opf)
3052
{
3053
enum hctx_type type = blk_mq_get_hctx_type(opf);
3054
struct request *rq;
3055
3056
if (!plug)
3057
return NULL;
3058
rq = rq_list_peek(&plug->cached_rqs);
3059
if (!rq || rq->q != q)
3060
return NULL;
3061
if (type != rq->mq_hctx->type &&
3062
(type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
3063
return NULL;
3064
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
3065
return NULL;
3066
return rq;
3067
}
3068
3069
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
3070
struct bio *bio)
3071
{
3072
if (rq_list_pop(&plug->cached_rqs) != rq)
3073
WARN_ON_ONCE(1);
3074
3075
/*
3076
* If any qos ->throttle() end up blocking, we will have flushed the
3077
* plug and hence killed the cached_rq list as well. Pop this entry
3078
* before we throttle.
3079
*/
3080
rq_qos_throttle(rq->q, bio);
3081
3082
blk_mq_rq_time_init(rq, blk_time_get_ns());
3083
rq->cmd_flags = bio->bi_opf;
3084
INIT_LIST_HEAD(&rq->queuelist);
3085
}
3086
3087
static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
3088
{
3089
unsigned int bs_mask = queue_logical_block_size(q) - 1;
3090
3091
/* .bi_sector of any zero sized bio need to be initialized */
3092
if ((bio->bi_iter.bi_size & bs_mask) ||
3093
((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask))
3094
return true;
3095
return false;
3096
}
3097
3098
/**
3099
* blk_mq_submit_bio - Create and send a request to block device.
3100
* @bio: Bio pointer.
3101
*
3102
* Builds up a request structure from @q and @bio and send to the device. The
3103
* request may not be queued directly to hardware if:
3104
* * This request can be merged with another one
3105
* * We want to place request at plug queue for possible future merging
3106
* * There is an IO scheduler active at this queue
3107
*
3108
* It will not queue the request if there is an error with the bio, or at the
3109
* request creation.
3110
*/
3111
void blk_mq_submit_bio(struct bio *bio)
3112
{
3113
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
3114
struct blk_plug *plug = current->plug;
3115
const int is_sync = op_is_sync(bio->bi_opf);
3116
struct blk_mq_hw_ctx *hctx;
3117
unsigned int nr_segs;
3118
struct request *rq;
3119
blk_status_t ret;
3120
3121
/*
3122
* If the plug has a cached request for this queue, try to use it.
3123
*/
3124
rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
3125
3126
/*
3127
* A BIO that was released from a zone write plug has already been
3128
* through the preparation in this function, already holds a reference
3129
* on the queue usage counter, and is the only write BIO in-flight for
3130
* the target zone. Go straight to preparing a request for it.
3131
*/
3132
if (bio_zone_write_plugging(bio)) {
3133
nr_segs = bio->__bi_nr_segments;
3134
if (rq)
3135
blk_queue_exit(q);
3136
goto new_request;
3137
}
3138
3139
/*
3140
* The cached request already holds a q_usage_counter reference and we
3141
* don't have to acquire a new one if we use it.
3142
*/
3143
if (!rq) {
3144
if (unlikely(bio_queue_enter(bio)))
3145
return;
3146
}
3147
3148
/*
3149
* Device reconfiguration may change logical block size or reduce the
3150
* number of poll queues, so the checks for alignment and poll support
3151
* have to be done with queue usage counter held.
3152
*/
3153
if (unlikely(bio_unaligned(bio, q))) {
3154
bio_io_error(bio);
3155
goto queue_exit;
3156
}
3157
3158
if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
3159
bio->bi_status = BLK_STS_NOTSUPP;
3160
bio_endio(bio);
3161
goto queue_exit;
3162
}
3163
3164
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
3165
if (!bio)
3166
goto queue_exit;
3167
3168
if (!bio_integrity_prep(bio))
3169
goto queue_exit;
3170
3171
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
3172
goto queue_exit;
3173
3174
if (bio_needs_zone_write_plugging(bio)) {
3175
if (blk_zone_plug_bio(bio, nr_segs))
3176
goto queue_exit;
3177
}
3178
3179
new_request:
3180
if (rq) {
3181
blk_mq_use_cached_rq(rq, plug, bio);
3182
} else {
3183
rq = blk_mq_get_new_requests(q, plug, bio);
3184
if (unlikely(!rq)) {
3185
if (bio->bi_opf & REQ_NOWAIT)
3186
bio_wouldblock_error(bio);
3187
goto queue_exit;
3188
}
3189
}
3190
3191
trace_block_getrq(bio);
3192
3193
rq_qos_track(q, rq, bio);
3194
3195
blk_mq_bio_to_request(rq, bio, nr_segs);
3196
3197
ret = blk_crypto_rq_get_keyslot(rq);
3198
if (ret != BLK_STS_OK) {
3199
bio->bi_status = ret;
3200
bio_endio(bio);
3201
blk_mq_free_request(rq);
3202
return;
3203
}
3204
3205
if (bio_zone_write_plugging(bio))
3206
blk_zone_write_plug_init_request(rq);
3207
3208
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
3209
return;
3210
3211
if (plug) {
3212
blk_add_rq_to_plug(plug, rq);
3213
return;
3214
}
3215
3216
hctx = rq->mq_hctx;
3217
if ((rq->rq_flags & RQF_USE_SCHED) ||
3218
(hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
3219
blk_mq_insert_request(rq, 0);
3220
blk_mq_run_hw_queue(hctx, true);
3221
} else {
3222
blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
3223
}
3224
return;
3225
3226
queue_exit:
3227
/*
3228
* Don't drop the queue reference if we were trying to use a cached
3229
* request and thus didn't acquire one.
3230
*/
3231
if (!rq)
3232
blk_queue_exit(q);
3233
}
3234
3235
#ifdef CONFIG_BLK_MQ_STACKING
3236
/**
3237
* blk_insert_cloned_request - Helper for stacking drivers to submit a request
3238
* @rq: the request being queued
3239
*/
3240
blk_status_t blk_insert_cloned_request(struct request *rq)
3241
{
3242
struct request_queue *q = rq->q;
3243
unsigned int max_sectors = blk_queue_get_max_sectors(rq);
3244
unsigned int max_segments = blk_rq_get_max_segments(rq);
3245
blk_status_t ret;
3246
3247
if (blk_rq_sectors(rq) > max_sectors) {
3248
/*
3249
* SCSI device does not have a good way to return if
3250
* Write Same/Zero is actually supported. If a device rejects
3251
* a non-read/write command (discard, write same,etc.) the
3252
* low-level device driver will set the relevant queue limit to
3253
* 0 to prevent blk-lib from issuing more of the offending
3254
* operations. Commands queued prior to the queue limit being
3255
* reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
3256
* errors being propagated to upper layers.
3257
*/
3258
if (max_sectors == 0)
3259
return BLK_STS_NOTSUPP;
3260
3261
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
3262
__func__, blk_rq_sectors(rq), max_sectors);
3263
return BLK_STS_IOERR;
3264
}
3265
3266
/*
3267
* The queue settings related to segment counting may differ from the
3268
* original queue.
3269
*/
3270
rq->nr_phys_segments = blk_recalc_rq_segments(rq);
3271
if (rq->nr_phys_segments > max_segments) {
3272
printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
3273
__func__, rq->nr_phys_segments, max_segments);
3274
return BLK_STS_IOERR;
3275
}
3276
3277
if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
3278
return BLK_STS_IOERR;
3279
3280
ret = blk_crypto_rq_get_keyslot(rq);
3281
if (ret != BLK_STS_OK)
3282
return ret;
3283
3284
blk_account_io_start(rq);
3285
3286
/*
3287
* Since we have a scheduler attached on the top device,
3288
* bypass a potential scheduler on the bottom device for
3289
* insert.
3290
*/
3291
blk_mq_run_dispatch_ops(q,
3292
ret = blk_mq_request_issue_directly(rq, true));
3293
if (ret)
3294
blk_account_io_done(rq, blk_time_get_ns());
3295
return ret;
3296
}
3297
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
3298
3299
/**
3300
* blk_rq_unprep_clone - Helper function to free all bios in a cloned request
3301
* @rq: the clone request to be cleaned up
3302
*
3303
* Description:
3304
* Free all bios in @rq for a cloned request.
3305
*/
3306
void blk_rq_unprep_clone(struct request *rq)
3307
{
3308
struct bio *bio;
3309
3310
while ((bio = rq->bio) != NULL) {
3311
rq->bio = bio->bi_next;
3312
3313
bio_put(bio);
3314
}
3315
}
3316
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3317
3318
/**
3319
* blk_rq_prep_clone - Helper function to setup clone request
3320
* @rq: the request to be setup
3321
* @rq_src: original request to be cloned
3322
* @bs: bio_set that bios for clone are allocated from
3323
* @gfp_mask: memory allocation mask for bio
3324
* @bio_ctr: setup function to be called for each clone bio.
3325
* Returns %0 for success, non %0 for failure.
3326
* @data: private data to be passed to @bio_ctr
3327
*
3328
* Description:
3329
* Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3330
* Also, pages which the original bios are pointing to are not copied
3331
* and the cloned bios just point same pages.
3332
* So cloned bios must be completed before original bios, which means
3333
* the caller must complete @rq before @rq_src.
3334
*/
3335
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
3336
struct bio_set *bs, gfp_t gfp_mask,
3337
int (*bio_ctr)(struct bio *, struct bio *, void *),
3338
void *data)
3339
{
3340
struct bio *bio_src;
3341
3342
if (!bs)
3343
bs = &fs_bio_set;
3344
3345
__rq_for_each_bio(bio_src, rq_src) {
3346
struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src,
3347
gfp_mask, bs);
3348
if (!bio)
3349
goto free_and_out;
3350
3351
if (bio_ctr && bio_ctr(bio, bio_src, data)) {
3352
bio_put(bio);
3353
goto free_and_out;
3354
}
3355
3356
if (rq->bio) {
3357
rq->biotail->bi_next = bio;
3358
rq->biotail = bio;
3359
} else {
3360
rq->bio = rq->biotail = bio;
3361
}
3362
}
3363
3364
/* Copy attributes of the original request to the clone request. */
3365
rq->__sector = blk_rq_pos(rq_src);
3366
rq->__data_len = blk_rq_bytes(rq_src);
3367
if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3368
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
3369
rq->special_vec = rq_src->special_vec;
3370
}
3371
rq->nr_phys_segments = rq_src->nr_phys_segments;
3372
rq->nr_integrity_segments = rq_src->nr_integrity_segments;
3373
3374
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
3375
goto free_and_out;
3376
3377
return 0;
3378
3379
free_and_out:
3380
blk_rq_unprep_clone(rq);
3381
3382
return -ENOMEM;
3383
}
3384
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
3385
#endif /* CONFIG_BLK_MQ_STACKING */
3386
3387
/*
3388
* Steal bios from a request and add them to a bio list.
3389
* The request must not have been partially completed before.
3390
*/
3391
void blk_steal_bios(struct bio_list *list, struct request *rq)
3392
{
3393
if (rq->bio) {
3394
if (list->tail)
3395
list->tail->bi_next = rq->bio;
3396
else
3397
list->head = rq->bio;
3398
list->tail = rq->biotail;
3399
3400
rq->bio = NULL;
3401
rq->biotail = NULL;
3402
}
3403
3404
rq->__data_len = 0;
3405
}
3406
EXPORT_SYMBOL_GPL(blk_steal_bios);
3407
3408
static size_t order_to_size(unsigned int order)
3409
{
3410
return (size_t)PAGE_SIZE << order;
3411
}
3412
3413
/* called before freeing request pool in @tags */
3414
static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
3415
struct blk_mq_tags *tags)
3416
{
3417
struct page *page;
3418
unsigned long flags;
3419
3420
/*
3421
* There is no need to clear mapping if driver tags is not initialized
3422
* or the mapping belongs to the driver tags.
3423
*/
3424
if (!drv_tags || drv_tags == tags)
3425
return;
3426
3427
list_for_each_entry(page, &tags->page_list, lru) {
3428
unsigned long start = (unsigned long)page_address(page);
3429
unsigned long end = start + order_to_size(page->private);
3430
int i;
3431
3432
for (i = 0; i < drv_tags->nr_tags; i++) {
3433
struct request *rq = drv_tags->rqs[i];
3434
unsigned long rq_addr = (unsigned long)rq;
3435
3436
if (rq_addr >= start && rq_addr < end) {
3437
WARN_ON_ONCE(req_ref_read(rq) != 0);
3438
cmpxchg(&drv_tags->rqs[i], rq, NULL);
3439
}
3440
}
3441
}
3442
3443
/*
3444
* Wait until all pending iteration is done.
3445
*
3446
* Request reference is cleared and it is guaranteed to be observed
3447
* after the ->lock is released.
3448
*/
3449
spin_lock_irqsave(&drv_tags->lock, flags);
3450
spin_unlock_irqrestore(&drv_tags->lock, flags);
3451
}
3452
3453
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
3454
unsigned int hctx_idx)
3455
{
3456
struct blk_mq_tags *drv_tags;
3457
struct page *page;
3458
3459
if (list_empty(&tags->page_list))
3460
return;
3461
3462
if (blk_mq_is_shared_tags(set->flags))
3463
drv_tags = set->shared_tags;
3464
else
3465
drv_tags = set->tags[hctx_idx];
3466
3467
if (tags->static_rqs && set->ops->exit_request) {
3468
int i;
3469
3470
for (i = 0; i < tags->nr_tags; i++) {
3471
struct request *rq = tags->static_rqs[i];
3472
3473
if (!rq)
3474
continue;
3475
set->ops->exit_request(set, rq, hctx_idx);
3476
tags->static_rqs[i] = NULL;
3477
}
3478
}
3479
3480
blk_mq_clear_rq_mapping(drv_tags, tags);
3481
3482
while (!list_empty(&tags->page_list)) {
3483
page = list_first_entry(&tags->page_list, struct page, lru);
3484
list_del_init(&page->lru);
3485
/*
3486
* Remove kmemleak object previously allocated in
3487
* blk_mq_alloc_rqs().
3488
*/
3489
kmemleak_free(page_address(page));
3490
__free_pages(page, page->private);
3491
}
3492
}
3493
3494
void blk_mq_free_rq_map(struct blk_mq_tags *tags)
3495
{
3496
kfree(tags->rqs);
3497
tags->rqs = NULL;
3498
kfree(tags->static_rqs);
3499
tags->static_rqs = NULL;
3500
3501
blk_mq_free_tags(tags);
3502
}
3503
3504
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
3505
unsigned int hctx_idx)
3506
{
3507
int i;
3508
3509
for (i = 0; i < set->nr_maps; i++) {
3510
unsigned int start = set->map[i].queue_offset;
3511
unsigned int end = start + set->map[i].nr_queues;
3512
3513
if (hctx_idx >= start && hctx_idx < end)
3514
break;
3515
}
3516
3517
if (i >= set->nr_maps)
3518
i = HCTX_TYPE_DEFAULT;
3519
3520
return i;
3521
}
3522
3523
static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
3524
unsigned int hctx_idx)
3525
{
3526
enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
3527
3528
return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
3529
}
3530
3531
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
3532
unsigned int hctx_idx,
3533
unsigned int nr_tags,
3534
unsigned int reserved_tags)
3535
{
3536
int node = blk_mq_get_hctx_node(set, hctx_idx);
3537
struct blk_mq_tags *tags;
3538
3539
if (node == NUMA_NO_NODE)
3540
node = set->numa_node;
3541
3542
tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
3543
if (!tags)
3544
return NULL;
3545
3546
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3547
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3548
node);
3549
if (!tags->rqs)
3550
goto err_free_tags;
3551
3552
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3553
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3554
node);
3555
if (!tags->static_rqs)
3556
goto err_free_rqs;
3557
3558
return tags;
3559
3560
err_free_rqs:
3561
kfree(tags->rqs);
3562
err_free_tags:
3563
blk_mq_free_tags(tags);
3564
return NULL;
3565
}
3566
3567
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
3568
unsigned int hctx_idx, int node)
3569
{
3570
int ret;
3571
3572
if (set->ops->init_request) {
3573
ret = set->ops->init_request(set, rq, hctx_idx, node);
3574
if (ret)
3575
return ret;
3576
}
3577
3578
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
3579
return 0;
3580
}
3581
3582
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
3583
struct blk_mq_tags *tags,
3584
unsigned int hctx_idx, unsigned int depth)
3585
{
3586
unsigned int i, j, entries_per_page, max_order = 4;
3587
int node = blk_mq_get_hctx_node(set, hctx_idx);
3588
size_t rq_size, left;
3589
3590
if (node == NUMA_NO_NODE)
3591
node = set->numa_node;
3592
3593
INIT_LIST_HEAD(&tags->page_list);
3594
3595
/*
3596
* rq_size is the size of the request plus driver payload, rounded
3597
* to the cacheline size
3598
*/
3599
rq_size = round_up(sizeof(struct request) + set->cmd_size,
3600
cache_line_size());
3601
left = rq_size * depth;
3602
3603
for (i = 0; i < depth; ) {
3604
int this_order = max_order;
3605
struct page *page;
3606
int to_do;
3607
void *p;
3608
3609
while (this_order && left < order_to_size(this_order - 1))
3610
this_order--;
3611
3612
do {
3613
page = alloc_pages_node(node,
3614
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
3615
this_order);
3616
if (page)
3617
break;
3618
if (!this_order--)
3619
break;
3620
if (order_to_size(this_order) < rq_size)
3621
break;
3622
} while (1);
3623
3624
if (!page)
3625
goto fail;
3626
3627
page->private = this_order;
3628
list_add_tail(&page->lru, &tags->page_list);
3629
3630
p = page_address(page);
3631
/*
3632
* Allow kmemleak to scan these pages as they contain pointers
3633
* to additional allocations like via ops->init_request().
3634
*/
3635
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
3636
entries_per_page = order_to_size(this_order) / rq_size;
3637
to_do = min(entries_per_page, depth - i);
3638
left -= to_do * rq_size;
3639
for (j = 0; j < to_do; j++) {
3640
struct request *rq = p;
3641
3642
tags->static_rqs[i] = rq;
3643
if (blk_mq_init_request(set, rq, hctx_idx, node)) {
3644
tags->static_rqs[i] = NULL;
3645
goto fail;
3646
}
3647
3648
p += rq_size;
3649
i++;
3650
}
3651
}
3652
return 0;
3653
3654
fail:
3655
blk_mq_free_rqs(set, tags, hctx_idx);
3656
return -ENOMEM;
3657
}
3658
3659
struct rq_iter_data {
3660
struct blk_mq_hw_ctx *hctx;
3661
bool has_rq;
3662
};
3663
3664
static bool blk_mq_has_request(struct request *rq, void *data)
3665
{
3666
struct rq_iter_data *iter_data = data;
3667
3668
if (rq->mq_hctx != iter_data->hctx)
3669
return true;
3670
iter_data->has_rq = true;
3671
return false;
3672
}
3673
3674
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
3675
{
3676
struct blk_mq_tags *tags = hctx->sched_tags ?
3677
hctx->sched_tags : hctx->tags;
3678
struct rq_iter_data data = {
3679
.hctx = hctx,
3680
};
3681
3682
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
3683
return data.has_rq;
3684
}
3685
3686
static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
3687
unsigned int this_cpu)
3688
{
3689
enum hctx_type type = hctx->type;
3690
int cpu;
3691
3692
/*
3693
* hctx->cpumask has to rule out isolated CPUs, but userspace still
3694
* might submit IOs on these isolated CPUs, so use the queue map to
3695
* check if all CPUs mapped to this hctx are offline
3696
*/
3697
for_each_online_cpu(cpu) {
3698
struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
3699
type, cpu);
3700
3701
if (h != hctx)
3702
continue;
3703
3704
/* this hctx has at least one online CPU */
3705
if (this_cpu != cpu)
3706
return true;
3707
}
3708
3709
return false;
3710
}
3711
3712
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
3713
{
3714
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3715
struct blk_mq_hw_ctx, cpuhp_online);
3716
3717
if (blk_mq_hctx_has_online_cpu(hctx, cpu))
3718
return 0;
3719
3720
/*
3721
* Prevent new request from being allocated on the current hctx.
3722
*
3723
* The smp_mb__after_atomic() Pairs with the implied barrier in
3724
* test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
3725
* seen once we return from the tag allocator.
3726
*/
3727
set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3728
smp_mb__after_atomic();
3729
3730
/*
3731
* Try to grab a reference to the queue and wait for any outstanding
3732
* requests. If we could not grab a reference the queue has been
3733
* frozen and there are no requests.
3734
*/
3735
if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
3736
while (blk_mq_hctx_has_requests(hctx))
3737
msleep(5);
3738
percpu_ref_put(&hctx->queue->q_usage_counter);
3739
}
3740
3741
return 0;
3742
}
3743
3744
/*
3745
* Check if one CPU is mapped to the specified hctx
3746
*
3747
* Isolated CPUs have been ruled out from hctx->cpumask, which is supposed
3748
* to be used for scheduling kworker only. For other usage, please call this
3749
* helper for checking if one CPU belongs to the specified hctx
3750
*/
3751
static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
3752
const struct blk_mq_hw_ctx *hctx)
3753
{
3754
struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue,
3755
hctx->type, cpu);
3756
3757
return mapped_hctx == hctx;
3758
}
3759
3760
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
3761
{
3762
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3763
struct blk_mq_hw_ctx, cpuhp_online);
3764
3765
if (blk_mq_cpu_mapped_to_hctx(cpu, hctx))
3766
clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3767
return 0;
3768
}
3769
3770
/*
3771
* 'cpu' is going away. splice any existing rq_list entries from this
3772
* software queue to the hw queue dispatch list, and ensure that it
3773
* gets run.
3774
*/
3775
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
3776
{
3777
struct blk_mq_hw_ctx *hctx;
3778
struct blk_mq_ctx *ctx;
3779
LIST_HEAD(tmp);
3780
enum hctx_type type;
3781
3782
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
3783
if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
3784
return 0;
3785
3786
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
3787
type = hctx->type;
3788
3789
spin_lock(&ctx->lock);
3790
if (!list_empty(&ctx->rq_lists[type])) {
3791
list_splice_init(&ctx->rq_lists[type], &tmp);
3792
blk_mq_hctx_clear_pending(hctx, ctx);
3793
}
3794
spin_unlock(&ctx->lock);
3795
3796
if (list_empty(&tmp))
3797
return 0;
3798
3799
spin_lock(&hctx->lock);
3800
list_splice_tail_init(&tmp, &hctx->dispatch);
3801
spin_unlock(&hctx->lock);
3802
3803
blk_mq_run_hw_queue(hctx, true);
3804
return 0;
3805
}
3806
3807
static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
3808
{
3809
lockdep_assert_held(&blk_mq_cpuhp_lock);
3810
3811
if (!(hctx->flags & BLK_MQ_F_STACKING) &&
3812
!hlist_unhashed(&hctx->cpuhp_online)) {
3813
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3814
&hctx->cpuhp_online);
3815
INIT_HLIST_NODE(&hctx->cpuhp_online);
3816
}
3817
3818
if (!hlist_unhashed(&hctx->cpuhp_dead)) {
3819
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
3820
&hctx->cpuhp_dead);
3821
INIT_HLIST_NODE(&hctx->cpuhp_dead);
3822
}
3823
}
3824
3825
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
3826
{
3827
mutex_lock(&blk_mq_cpuhp_lock);
3828
__blk_mq_remove_cpuhp(hctx);
3829
mutex_unlock(&blk_mq_cpuhp_lock);
3830
}
3831
3832
static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
3833
{
3834
lockdep_assert_held(&blk_mq_cpuhp_lock);
3835
3836
if (!(hctx->flags & BLK_MQ_F_STACKING) &&
3837
hlist_unhashed(&hctx->cpuhp_online))
3838
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3839
&hctx->cpuhp_online);
3840
3841
if (hlist_unhashed(&hctx->cpuhp_dead))
3842
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
3843
&hctx->cpuhp_dead);
3844
}
3845
3846
static void __blk_mq_remove_cpuhp_list(struct list_head *head)
3847
{
3848
struct blk_mq_hw_ctx *hctx;
3849
3850
lockdep_assert_held(&blk_mq_cpuhp_lock);
3851
3852
list_for_each_entry(hctx, head, hctx_list)
3853
__blk_mq_remove_cpuhp(hctx);
3854
}
3855
3856
/*
3857
* Unregister cpuhp callbacks from exited hw queues
3858
*
3859
* Safe to call if this `request_queue` is live
3860
*/
3861
static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
3862
{
3863
LIST_HEAD(hctx_list);
3864
3865
spin_lock(&q->unused_hctx_lock);
3866
list_splice_init(&q->unused_hctx_list, &hctx_list);
3867
spin_unlock(&q->unused_hctx_lock);
3868
3869
mutex_lock(&blk_mq_cpuhp_lock);
3870
__blk_mq_remove_cpuhp_list(&hctx_list);
3871
mutex_unlock(&blk_mq_cpuhp_lock);
3872
3873
spin_lock(&q->unused_hctx_lock);
3874
list_splice(&hctx_list, &q->unused_hctx_list);
3875
spin_unlock(&q->unused_hctx_lock);
3876
}
3877
3878
/*
3879
* Register cpuhp callbacks from all hw queues
3880
*
3881
* Safe to call if this `request_queue` is live
3882
*/
3883
static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
3884
{
3885
struct blk_mq_hw_ctx *hctx;
3886
unsigned long i;
3887
3888
mutex_lock(&blk_mq_cpuhp_lock);
3889
queue_for_each_hw_ctx(q, hctx, i)
3890
__blk_mq_add_cpuhp(hctx);
3891
mutex_unlock(&blk_mq_cpuhp_lock);
3892
}
3893
3894
/*
3895
* Before freeing hw queue, clearing the flush request reference in
3896
* tags->rqs[] for avoiding potential UAF.
3897
*/
3898
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
3899
unsigned int queue_depth, struct request *flush_rq)
3900
{
3901
int i;
3902
unsigned long flags;
3903
3904
/* The hw queue may not be mapped yet */
3905
if (!tags)
3906
return;
3907
3908
WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
3909
3910
for (i = 0; i < queue_depth; i++)
3911
cmpxchg(&tags->rqs[i], flush_rq, NULL);
3912
3913
/*
3914
* Wait until all pending iteration is done.
3915
*
3916
* Request reference is cleared and it is guaranteed to be observed
3917
* after the ->lock is released.
3918
*/
3919
spin_lock_irqsave(&tags->lock, flags);
3920
spin_unlock_irqrestore(&tags->lock, flags);
3921
}
3922
3923
/* hctx->ctxs will be freed in queue's release handler */
3924
static void blk_mq_exit_hctx(struct request_queue *q,
3925
struct blk_mq_tag_set *set,
3926
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
3927
{
3928
struct request *flush_rq = hctx->fq->flush_rq;
3929
3930
if (blk_mq_hw_queue_mapped(hctx))
3931
blk_mq_tag_idle(hctx);
3932
3933
if (blk_queue_init_done(q))
3934
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
3935
set->queue_depth, flush_rq);
3936
if (set->ops->exit_request)
3937
set->ops->exit_request(set, flush_rq, hctx_idx);
3938
3939
if (set->ops->exit_hctx)
3940
set->ops->exit_hctx(hctx, hctx_idx);
3941
3942
xa_erase(&q->hctx_table, hctx_idx);
3943
3944
spin_lock(&q->unused_hctx_lock);
3945
list_add(&hctx->hctx_list, &q->unused_hctx_list);
3946
spin_unlock(&q->unused_hctx_lock);
3947
}
3948
3949
static void blk_mq_exit_hw_queues(struct request_queue *q,
3950
struct blk_mq_tag_set *set, int nr_queue)
3951
{
3952
struct blk_mq_hw_ctx *hctx;
3953
unsigned long i;
3954
3955
queue_for_each_hw_ctx(q, hctx, i) {
3956
if (i == nr_queue)
3957
break;
3958
blk_mq_remove_cpuhp(hctx);
3959
blk_mq_exit_hctx(q, set, hctx, i);
3960
}
3961
}
3962
3963
static int blk_mq_init_hctx(struct request_queue *q,
3964
struct blk_mq_tag_set *set,
3965
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
3966
{
3967
hctx->queue_num = hctx_idx;
3968
3969
hctx->tags = set->tags[hctx_idx];
3970
3971
if (set->ops->init_hctx &&
3972
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
3973
goto fail;
3974
3975
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
3976
hctx->numa_node))
3977
goto exit_hctx;
3978
3979
if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
3980
goto exit_flush_rq;
3981
3982
return 0;
3983
3984
exit_flush_rq:
3985
if (set->ops->exit_request)
3986
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
3987
exit_hctx:
3988
if (set->ops->exit_hctx)
3989
set->ops->exit_hctx(hctx, hctx_idx);
3990
fail:
3991
return -1;
3992
}
3993
3994
static struct blk_mq_hw_ctx *
3995
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
3996
int node)
3997
{
3998
struct blk_mq_hw_ctx *hctx;
3999
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
4000
4001
hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
4002
if (!hctx)
4003
goto fail_alloc_hctx;
4004
4005
if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
4006
goto free_hctx;
4007
4008
atomic_set(&hctx->nr_active, 0);
4009
if (node == NUMA_NO_NODE)
4010
node = set->numa_node;
4011
hctx->numa_node = node;
4012
4013
INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
4014
spin_lock_init(&hctx->lock);
4015
INIT_LIST_HEAD(&hctx->dispatch);
4016
INIT_HLIST_NODE(&hctx->cpuhp_dead);
4017
INIT_HLIST_NODE(&hctx->cpuhp_online);
4018
hctx->queue = q;
4019
hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
4020
4021
INIT_LIST_HEAD(&hctx->hctx_list);
4022
4023
/*
4024
* Allocate space for all possible cpus to avoid allocation at
4025
* runtime
4026
*/
4027
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
4028
gfp, node);
4029
if (!hctx->ctxs)
4030
goto free_cpumask;
4031
4032
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
4033
gfp, node, false, false))
4034
goto free_ctxs;
4035
hctx->nr_ctx = 0;
4036
4037
spin_lock_init(&hctx->dispatch_wait_lock);
4038
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
4039
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
4040
4041
hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
4042
if (!hctx->fq)
4043
goto free_bitmap;
4044
4045
blk_mq_hctx_kobj_init(hctx);
4046
4047
return hctx;
4048
4049
free_bitmap:
4050
sbitmap_free(&hctx->ctx_map);
4051
free_ctxs:
4052
kfree(hctx->ctxs);
4053
free_cpumask:
4054
free_cpumask_var(hctx->cpumask);
4055
free_hctx:
4056
kfree(hctx);
4057
fail_alloc_hctx:
4058
return NULL;
4059
}
4060
4061
static void blk_mq_init_cpu_queues(struct request_queue *q,
4062
unsigned int nr_hw_queues)
4063
{
4064
struct blk_mq_tag_set *set = q->tag_set;
4065
unsigned int i, j;
4066
4067
for_each_possible_cpu(i) {
4068
struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
4069
struct blk_mq_hw_ctx *hctx;
4070
int k;
4071
4072
__ctx->cpu = i;
4073
spin_lock_init(&__ctx->lock);
4074
for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
4075
INIT_LIST_HEAD(&__ctx->rq_lists[k]);
4076
4077
__ctx->queue = q;
4078
4079
/*
4080
* Set local node, IFF we have more than one hw queue. If
4081
* not, we remain on the home node of the device
4082
*/
4083
for (j = 0; j < set->nr_maps; j++) {
4084
hctx = blk_mq_map_queue_type(q, j, i);
4085
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
4086
hctx->numa_node = cpu_to_node(i);
4087
}
4088
}
4089
}
4090
4091
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
4092
unsigned int hctx_idx,
4093
unsigned int depth)
4094
{
4095
struct blk_mq_tags *tags;
4096
int ret;
4097
4098
tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
4099
if (!tags)
4100
return NULL;
4101
4102
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
4103
if (ret) {
4104
blk_mq_free_rq_map(tags);
4105
return NULL;
4106
}
4107
4108
return tags;
4109
}
4110
4111
static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
4112
int hctx_idx)
4113
{
4114
if (blk_mq_is_shared_tags(set->flags)) {
4115
set->tags[hctx_idx] = set->shared_tags;
4116
4117
return true;
4118
}
4119
4120
set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
4121
set->queue_depth);
4122
4123
return set->tags[hctx_idx];
4124
}
4125
4126
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
4127
struct blk_mq_tags *tags,
4128
unsigned int hctx_idx)
4129
{
4130
if (tags) {
4131
blk_mq_free_rqs(set, tags, hctx_idx);
4132
blk_mq_free_rq_map(tags);
4133
}
4134
}
4135
4136
static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
4137
unsigned int hctx_idx)
4138
{
4139
if (!blk_mq_is_shared_tags(set->flags))
4140
blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
4141
4142
set->tags[hctx_idx] = NULL;
4143
}
4144
4145
static void blk_mq_map_swqueue(struct request_queue *q)
4146
{
4147
unsigned int j, hctx_idx;
4148
unsigned long i;
4149
struct blk_mq_hw_ctx *hctx;
4150
struct blk_mq_ctx *ctx;
4151
struct blk_mq_tag_set *set = q->tag_set;
4152
4153
queue_for_each_hw_ctx(q, hctx, i) {
4154
cpumask_clear(hctx->cpumask);
4155
hctx->nr_ctx = 0;
4156
hctx->dispatch_from = NULL;
4157
}
4158
4159
/*
4160
* Map software to hardware queues.
4161
*
4162
* If the cpu isn't present, the cpu is mapped to first hctx.
4163
*/
4164
for_each_possible_cpu(i) {
4165
4166
ctx = per_cpu_ptr(q->queue_ctx, i);
4167
for (j = 0; j < set->nr_maps; j++) {
4168
if (!set->map[j].nr_queues) {
4169
ctx->hctxs[j] = blk_mq_map_queue_type(q,
4170
HCTX_TYPE_DEFAULT, i);
4171
continue;
4172
}
4173
hctx_idx = set->map[j].mq_map[i];
4174
/* unmapped hw queue can be remapped after CPU topo changed */
4175
if (!set->tags[hctx_idx] &&
4176
!__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
4177
/*
4178
* If tags initialization fail for some hctx,
4179
* that hctx won't be brought online. In this
4180
* case, remap the current ctx to hctx[0] which
4181
* is guaranteed to always have tags allocated
4182
*/
4183
set->map[j].mq_map[i] = 0;
4184
}
4185
4186
hctx = blk_mq_map_queue_type(q, j, i);
4187
ctx->hctxs[j] = hctx;
4188
/*
4189
* If the CPU is already set in the mask, then we've
4190
* mapped this one already. This can happen if
4191
* devices share queues across queue maps.
4192
*/
4193
if (cpumask_test_cpu(i, hctx->cpumask))
4194
continue;
4195
4196
cpumask_set_cpu(i, hctx->cpumask);
4197
hctx->type = j;
4198
ctx->index_hw[hctx->type] = hctx->nr_ctx;
4199
hctx->ctxs[hctx->nr_ctx++] = ctx;
4200
4201
/*
4202
* If the nr_ctx type overflows, we have exceeded the
4203
* amount of sw queues we can support.
4204
*/
4205
BUG_ON(!hctx->nr_ctx);
4206
}
4207
4208
for (; j < HCTX_MAX_TYPES; j++)
4209
ctx->hctxs[j] = blk_mq_map_queue_type(q,
4210
HCTX_TYPE_DEFAULT, i);
4211
}
4212
4213
queue_for_each_hw_ctx(q, hctx, i) {
4214
int cpu;
4215
4216
/*
4217
* If no software queues are mapped to this hardware queue,
4218
* disable it and free the request entries.
4219
*/
4220
if (!hctx->nr_ctx) {
4221
/* Never unmap queue 0. We need it as a
4222
* fallback in case of a new remap fails
4223
* allocation
4224
*/
4225
if (i)
4226
__blk_mq_free_map_and_rqs(set, i);
4227
4228
hctx->tags = NULL;
4229
continue;
4230
}
4231
4232
hctx->tags = set->tags[i];
4233
WARN_ON(!hctx->tags);
4234
4235
/*
4236
* Set the map size to the number of mapped software queues.
4237
* This is more accurate and more efficient than looping
4238
* over all possibly mapped software queues.
4239
*/
4240
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
4241
4242
/*
4243
* Rule out isolated CPUs from hctx->cpumask to avoid
4244
* running block kworker on isolated CPUs
4245
*/
4246
for_each_cpu(cpu, hctx->cpumask) {
4247
if (cpu_is_isolated(cpu))
4248
cpumask_clear_cpu(cpu, hctx->cpumask);
4249
}
4250
4251
/*
4252
* Initialize batch roundrobin counts
4253
*/
4254
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
4255
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
4256
}
4257
}
4258
4259
/*
4260
* Caller needs to ensure that we're either frozen/quiesced, or that
4261
* the queue isn't live yet.
4262
*/
4263
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
4264
{
4265
struct blk_mq_hw_ctx *hctx;
4266
unsigned long i;
4267
4268
queue_for_each_hw_ctx(q, hctx, i) {
4269
if (shared) {
4270
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
4271
} else {
4272
blk_mq_tag_idle(hctx);
4273
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
4274
}
4275
}
4276
}
4277
4278
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
4279
bool shared)
4280
{
4281
struct request_queue *q;
4282
unsigned int memflags;
4283
4284
lockdep_assert_held(&set->tag_list_lock);
4285
4286
list_for_each_entry(q, &set->tag_list, tag_set_list) {
4287
memflags = blk_mq_freeze_queue(q);
4288
queue_set_hctx_shared(q, shared);
4289
blk_mq_unfreeze_queue(q, memflags);
4290
}
4291
}
4292
4293
static void blk_mq_del_queue_tag_set(struct request_queue *q)
4294
{
4295
struct blk_mq_tag_set *set = q->tag_set;
4296
4297
mutex_lock(&set->tag_list_lock);
4298
list_del(&q->tag_set_list);
4299
if (list_is_singular(&set->tag_list)) {
4300
/* just transitioned to unshared */
4301
set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
4302
/* update existing queue */
4303
blk_mq_update_tag_set_shared(set, false);
4304
}
4305
mutex_unlock(&set->tag_list_lock);
4306
INIT_LIST_HEAD(&q->tag_set_list);
4307
}
4308
4309
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
4310
struct request_queue *q)
4311
{
4312
mutex_lock(&set->tag_list_lock);
4313
4314
/*
4315
* Check to see if we're transitioning to shared (from 1 to 2 queues).
4316
*/
4317
if (!list_empty(&set->tag_list) &&
4318
!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
4319
set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
4320
/* update existing queue */
4321
blk_mq_update_tag_set_shared(set, true);
4322
}
4323
if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
4324
queue_set_hctx_shared(q, true);
4325
list_add_tail(&q->tag_set_list, &set->tag_list);
4326
4327
mutex_unlock(&set->tag_list_lock);
4328
}
4329
4330
/* All allocations will be freed in release handler of q->mq_kobj */
4331
static int blk_mq_alloc_ctxs(struct request_queue *q)
4332
{
4333
struct blk_mq_ctxs *ctxs;
4334
int cpu;
4335
4336
ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
4337
if (!ctxs)
4338
return -ENOMEM;
4339
4340
ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
4341
if (!ctxs->queue_ctx)
4342
goto fail;
4343
4344
for_each_possible_cpu(cpu) {
4345
struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
4346
ctx->ctxs = ctxs;
4347
}
4348
4349
q->mq_kobj = &ctxs->kobj;
4350
q->queue_ctx = ctxs->queue_ctx;
4351
4352
return 0;
4353
fail:
4354
kfree(ctxs);
4355
return -ENOMEM;
4356
}
4357
4358
/*
4359
* It is the actual release handler for mq, but we do it from
4360
* request queue's release handler for avoiding use-after-free
4361
* and headache because q->mq_kobj shouldn't have been introduced,
4362
* but we can't group ctx/kctx kobj without it.
4363
*/
4364
void blk_mq_release(struct request_queue *q)
4365
{
4366
struct blk_mq_hw_ctx *hctx, *next;
4367
unsigned long i;
4368
4369
queue_for_each_hw_ctx(q, hctx, i)
4370
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
4371
4372
/* all hctx are in .unused_hctx_list now */
4373
list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
4374
list_del_init(&hctx->hctx_list);
4375
kobject_put(&hctx->kobj);
4376
}
4377
4378
xa_destroy(&q->hctx_table);
4379
4380
/*
4381
* release .mq_kobj and sw queue's kobject now because
4382
* both share lifetime with request queue.
4383
*/
4384
blk_mq_sysfs_deinit(q);
4385
}
4386
4387
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
4388
struct queue_limits *lim, void *queuedata)
4389
{
4390
struct queue_limits default_lim = { };
4391
struct request_queue *q;
4392
int ret;
4393
4394
if (!lim)
4395
lim = &default_lim;
4396
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
4397
if (set->nr_maps > HCTX_TYPE_POLL)
4398
lim->features |= BLK_FEAT_POLL;
4399
4400
q = blk_alloc_queue(lim, set->numa_node);
4401
if (IS_ERR(q))
4402
return q;
4403
q->queuedata = queuedata;
4404
ret = blk_mq_init_allocated_queue(set, q);
4405
if (ret) {
4406
blk_put_queue(q);
4407
return ERR_PTR(ret);
4408
}
4409
return q;
4410
}
4411
EXPORT_SYMBOL(blk_mq_alloc_queue);
4412
4413
/**
4414
* blk_mq_destroy_queue - shutdown a request queue
4415
* @q: request queue to shutdown
4416
*
4417
* This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
4418
* requests will be failed with -ENODEV. The caller is responsible for dropping
4419
* the reference from blk_mq_alloc_queue() by calling blk_put_queue().
4420
*
4421
* Context: can sleep
4422
*/
4423
void blk_mq_destroy_queue(struct request_queue *q)
4424
{
4425
WARN_ON_ONCE(!queue_is_mq(q));
4426
WARN_ON_ONCE(blk_queue_registered(q));
4427
4428
might_sleep();
4429
4430
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
4431
blk_queue_start_drain(q);
4432
blk_mq_freeze_queue_wait(q);
4433
4434
blk_sync_queue(q);
4435
blk_mq_cancel_work_sync(q);
4436
blk_mq_exit_queue(q);
4437
}
4438
EXPORT_SYMBOL(blk_mq_destroy_queue);
4439
4440
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
4441
struct queue_limits *lim, void *queuedata,
4442
struct lock_class_key *lkclass)
4443
{
4444
struct request_queue *q;
4445
struct gendisk *disk;
4446
4447
q = blk_mq_alloc_queue(set, lim, queuedata);
4448
if (IS_ERR(q))
4449
return ERR_CAST(q);
4450
4451
disk = __alloc_disk_node(q, set->numa_node, lkclass);
4452
if (!disk) {
4453
blk_mq_destroy_queue(q);
4454
blk_put_queue(q);
4455
return ERR_PTR(-ENOMEM);
4456
}
4457
set_bit(GD_OWNS_QUEUE, &disk->state);
4458
return disk;
4459
}
4460
EXPORT_SYMBOL(__blk_mq_alloc_disk);
4461
4462
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
4463
struct lock_class_key *lkclass)
4464
{
4465
struct gendisk *disk;
4466
4467
if (!blk_get_queue(q))
4468
return NULL;
4469
disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
4470
if (!disk)
4471
blk_put_queue(q);
4472
return disk;
4473
}
4474
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
4475
4476
/*
4477
* Only hctx removed from cpuhp list can be reused
4478
*/
4479
static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx)
4480
{
4481
return hlist_unhashed(&hctx->cpuhp_online) &&
4482
hlist_unhashed(&hctx->cpuhp_dead);
4483
}
4484
4485
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
4486
struct blk_mq_tag_set *set, struct request_queue *q,
4487
int hctx_idx, int node)
4488
{
4489
struct blk_mq_hw_ctx *hctx = NULL, *tmp;
4490
4491
/* reuse dead hctx first */
4492
spin_lock(&q->unused_hctx_lock);
4493
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
4494
if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) {
4495
hctx = tmp;
4496
break;
4497
}
4498
}
4499
if (hctx)
4500
list_del_init(&hctx->hctx_list);
4501
spin_unlock(&q->unused_hctx_lock);
4502
4503
if (!hctx)
4504
hctx = blk_mq_alloc_hctx(q, set, node);
4505
if (!hctx)
4506
goto fail;
4507
4508
if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
4509
goto free_hctx;
4510
4511
return hctx;
4512
4513
free_hctx:
4514
kobject_put(&hctx->kobj);
4515
fail:
4516
return NULL;
4517
}
4518
4519
static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
4520
struct request_queue *q)
4521
{
4522
struct blk_mq_hw_ctx *hctx;
4523
unsigned long i, j;
4524
4525
for (i = 0; i < set->nr_hw_queues; i++) {
4526
int old_node;
4527
int node = blk_mq_get_hctx_node(set, i);
4528
struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
4529
4530
if (old_hctx) {
4531
old_node = old_hctx->numa_node;
4532
blk_mq_exit_hctx(q, set, old_hctx, i);
4533
}
4534
4535
if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
4536
if (!old_hctx)
4537
break;
4538
pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
4539
node, old_node);
4540
hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
4541
WARN_ON_ONCE(!hctx);
4542
}
4543
}
4544
/*
4545
* Increasing nr_hw_queues fails. Free the newly allocated
4546
* hctxs and keep the previous q->nr_hw_queues.
4547
*/
4548
if (i != set->nr_hw_queues) {
4549
j = q->nr_hw_queues;
4550
} else {
4551
j = i;
4552
q->nr_hw_queues = set->nr_hw_queues;
4553
}
4554
4555
xa_for_each_start(&q->hctx_table, j, hctx, j)
4556
blk_mq_exit_hctx(q, set, hctx, j);
4557
}
4558
4559
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
4560
struct request_queue *q)
4561
{
4562
__blk_mq_realloc_hw_ctxs(set, q);
4563
4564
/* unregister cpuhp callbacks for exited hctxs */
4565
blk_mq_remove_hw_queues_cpuhp(q);
4566
4567
/* register cpuhp for new initialized hctxs */
4568
blk_mq_add_hw_queues_cpuhp(q);
4569
}
4570
4571
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
4572
struct request_queue *q)
4573
{
4574
/* mark the queue as mq asap */
4575
q->mq_ops = set->ops;
4576
4577
/*
4578
* ->tag_set has to be setup before initialize hctx, which cpuphp
4579
* handler needs it for checking queue mapping
4580
*/
4581
q->tag_set = set;
4582
4583
if (blk_mq_alloc_ctxs(q))
4584
goto err_exit;
4585
4586
/* init q->mq_kobj and sw queues' kobjects */
4587
blk_mq_sysfs_init(q);
4588
4589
INIT_LIST_HEAD(&q->unused_hctx_list);
4590
spin_lock_init(&q->unused_hctx_lock);
4591
4592
xa_init(&q->hctx_table);
4593
4594
blk_mq_realloc_hw_ctxs(set, q);
4595
if (!q->nr_hw_queues)
4596
goto err_hctxs;
4597
4598
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
4599
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
4600
4601
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
4602
4603
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
4604
INIT_LIST_HEAD(&q->flush_list);
4605
INIT_LIST_HEAD(&q->requeue_list);
4606
spin_lock_init(&q->requeue_lock);
4607
4608
q->nr_requests = set->queue_depth;
4609
4610
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
4611
blk_mq_map_swqueue(q);
4612
blk_mq_add_queue_tag_set(set, q);
4613
return 0;
4614
4615
err_hctxs:
4616
blk_mq_release(q);
4617
err_exit:
4618
q->mq_ops = NULL;
4619
return -ENOMEM;
4620
}
4621
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
4622
4623
/* tags can _not_ be used after returning from blk_mq_exit_queue */
4624
void blk_mq_exit_queue(struct request_queue *q)
4625
{
4626
struct blk_mq_tag_set *set = q->tag_set;
4627
4628
/* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
4629
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
4630
/* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
4631
blk_mq_del_queue_tag_set(q);
4632
}
4633
4634
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
4635
{
4636
int i;
4637
4638
if (blk_mq_is_shared_tags(set->flags)) {
4639
set->shared_tags = blk_mq_alloc_map_and_rqs(set,
4640
BLK_MQ_NO_HCTX_IDX,
4641
set->queue_depth);
4642
if (!set->shared_tags)
4643
return -ENOMEM;
4644
}
4645
4646
for (i = 0; i < set->nr_hw_queues; i++) {
4647
if (!__blk_mq_alloc_map_and_rqs(set, i))
4648
goto out_unwind;
4649
cond_resched();
4650
}
4651
4652
return 0;
4653
4654
out_unwind:
4655
while (--i >= 0)
4656
__blk_mq_free_map_and_rqs(set, i);
4657
4658
if (blk_mq_is_shared_tags(set->flags)) {
4659
blk_mq_free_map_and_rqs(set, set->shared_tags,
4660
BLK_MQ_NO_HCTX_IDX);
4661
}
4662
4663
return -ENOMEM;
4664
}
4665
4666
/*
4667
* Allocate the request maps associated with this tag_set. Note that this
4668
* may reduce the depth asked for, if memory is tight. set->queue_depth
4669
* will be updated to reflect the allocated depth.
4670
*/
4671
static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
4672
{
4673
unsigned int depth;
4674
int err;
4675
4676
depth = set->queue_depth;
4677
do {
4678
err = __blk_mq_alloc_rq_maps(set);
4679
if (!err)
4680
break;
4681
4682
set->queue_depth >>= 1;
4683
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
4684
err = -ENOMEM;
4685
break;
4686
}
4687
} while (set->queue_depth);
4688
4689
if (!set->queue_depth || err) {
4690
pr_err("blk-mq: failed to allocate request map\n");
4691
return -ENOMEM;
4692
}
4693
4694
if (depth != set->queue_depth)
4695
pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
4696
depth, set->queue_depth);
4697
4698
return 0;
4699
}
4700
4701
static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
4702
{
4703
/*
4704
* blk_mq_map_queues() and multiple .map_queues() implementations
4705
* expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
4706
* number of hardware queues.
4707
*/
4708
if (set->nr_maps == 1)
4709
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
4710
4711
if (set->ops->map_queues) {
4712
int i;
4713
4714
/*
4715
* transport .map_queues is usually done in the following
4716
* way:
4717
*
4718
* for (queue = 0; queue < set->nr_hw_queues; queue++) {
4719
* mask = get_cpu_mask(queue)
4720
* for_each_cpu(cpu, mask)
4721
* set->map[x].mq_map[cpu] = queue;
4722
* }
4723
*
4724
* When we need to remap, the table has to be cleared for
4725
* killing stale mapping since one CPU may not be mapped
4726
* to any hw queue.
4727
*/
4728
for (i = 0; i < set->nr_maps; i++)
4729
blk_mq_clear_mq_map(&set->map[i]);
4730
4731
set->ops->map_queues(set);
4732
} else {
4733
BUG_ON(set->nr_maps > 1);
4734
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4735
}
4736
}
4737
4738
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
4739
int new_nr_hw_queues)
4740
{
4741
struct blk_mq_tags **new_tags;
4742
int i;
4743
4744
if (set->nr_hw_queues >= new_nr_hw_queues)
4745
goto done;
4746
4747
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
4748
GFP_KERNEL, set->numa_node);
4749
if (!new_tags)
4750
return -ENOMEM;
4751
4752
if (set->tags)
4753
memcpy(new_tags, set->tags, set->nr_hw_queues *
4754
sizeof(*set->tags));
4755
kfree(set->tags);
4756
set->tags = new_tags;
4757
4758
for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
4759
if (!__blk_mq_alloc_map_and_rqs(set, i)) {
4760
while (--i >= set->nr_hw_queues)
4761
__blk_mq_free_map_and_rqs(set, i);
4762
return -ENOMEM;
4763
}
4764
cond_resched();
4765
}
4766
4767
done:
4768
set->nr_hw_queues = new_nr_hw_queues;
4769
return 0;
4770
}
4771
4772
/*
4773
* Alloc a tag set to be associated with one or more request queues.
4774
* May fail with EINVAL for various error conditions. May adjust the
4775
* requested depth down, if it's too large. In that case, the set
4776
* value will be stored in set->queue_depth.
4777
*/
4778
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
4779
{
4780
int i, ret;
4781
4782
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
4783
4784
if (!set->nr_hw_queues)
4785
return -EINVAL;
4786
if (!set->queue_depth)
4787
return -EINVAL;
4788
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
4789
return -EINVAL;
4790
4791
if (!set->ops->queue_rq)
4792
return -EINVAL;
4793
4794
if (!set->ops->get_budget ^ !set->ops->put_budget)
4795
return -EINVAL;
4796
4797
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
4798
pr_info("blk-mq: reduced tag depth to %u\n",
4799
BLK_MQ_MAX_DEPTH);
4800
set->queue_depth = BLK_MQ_MAX_DEPTH;
4801
}
4802
4803
if (!set->nr_maps)
4804
set->nr_maps = 1;
4805
else if (set->nr_maps > HCTX_MAX_TYPES)
4806
return -EINVAL;
4807
4808
/*
4809
* If a crashdump is active, then we are potentially in a very
4810
* memory constrained environment. Limit us to 64 tags to prevent
4811
* using too much memory.
4812
*/
4813
if (is_kdump_kernel())
4814
set->queue_depth = min(64U, set->queue_depth);
4815
4816
/*
4817
* There is no use for more h/w queues than cpus if we just have
4818
* a single map
4819
*/
4820
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
4821
set->nr_hw_queues = nr_cpu_ids;
4822
4823
if (set->flags & BLK_MQ_F_BLOCKING) {
4824
set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
4825
if (!set->srcu)
4826
return -ENOMEM;
4827
ret = init_srcu_struct(set->srcu);
4828
if (ret)
4829
goto out_free_srcu;
4830
}
4831
4832
init_rwsem(&set->update_nr_hwq_lock);
4833
4834
ret = -ENOMEM;
4835
set->tags = kcalloc_node(set->nr_hw_queues,
4836
sizeof(struct blk_mq_tags *), GFP_KERNEL,
4837
set->numa_node);
4838
if (!set->tags)
4839
goto out_cleanup_srcu;
4840
4841
for (i = 0; i < set->nr_maps; i++) {
4842
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
4843
sizeof(set->map[i].mq_map[0]),
4844
GFP_KERNEL, set->numa_node);
4845
if (!set->map[i].mq_map)
4846
goto out_free_mq_map;
4847
set->map[i].nr_queues = set->nr_hw_queues;
4848
}
4849
4850
blk_mq_update_queue_map(set);
4851
4852
ret = blk_mq_alloc_set_map_and_rqs(set);
4853
if (ret)
4854
goto out_free_mq_map;
4855
4856
mutex_init(&set->tag_list_lock);
4857
INIT_LIST_HEAD(&set->tag_list);
4858
4859
return 0;
4860
4861
out_free_mq_map:
4862
for (i = 0; i < set->nr_maps; i++) {
4863
kfree(set->map[i].mq_map);
4864
set->map[i].mq_map = NULL;
4865
}
4866
kfree(set->tags);
4867
set->tags = NULL;
4868
out_cleanup_srcu:
4869
if (set->flags & BLK_MQ_F_BLOCKING)
4870
cleanup_srcu_struct(set->srcu);
4871
out_free_srcu:
4872
if (set->flags & BLK_MQ_F_BLOCKING)
4873
kfree(set->srcu);
4874
return ret;
4875
}
4876
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
4877
4878
/* allocate and initialize a tagset for a simple single-queue device */
4879
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
4880
const struct blk_mq_ops *ops, unsigned int queue_depth,
4881
unsigned int set_flags)
4882
{
4883
memset(set, 0, sizeof(*set));
4884
set->ops = ops;
4885
set->nr_hw_queues = 1;
4886
set->nr_maps = 1;
4887
set->queue_depth = queue_depth;
4888
set->numa_node = NUMA_NO_NODE;
4889
set->flags = set_flags;
4890
return blk_mq_alloc_tag_set(set);
4891
}
4892
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
4893
4894
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
4895
{
4896
int i, j;
4897
4898
for (i = 0; i < set->nr_hw_queues; i++)
4899
__blk_mq_free_map_and_rqs(set, i);
4900
4901
if (blk_mq_is_shared_tags(set->flags)) {
4902
blk_mq_free_map_and_rqs(set, set->shared_tags,
4903
BLK_MQ_NO_HCTX_IDX);
4904
}
4905
4906
for (j = 0; j < set->nr_maps; j++) {
4907
kfree(set->map[j].mq_map);
4908
set->map[j].mq_map = NULL;
4909
}
4910
4911
kfree(set->tags);
4912
set->tags = NULL;
4913
if (set->flags & BLK_MQ_F_BLOCKING) {
4914
cleanup_srcu_struct(set->srcu);
4915
kfree(set->srcu);
4916
}
4917
}
4918
EXPORT_SYMBOL(blk_mq_free_tag_set);
4919
4920
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
4921
{
4922
struct blk_mq_tag_set *set = q->tag_set;
4923
struct blk_mq_hw_ctx *hctx;
4924
int ret;
4925
unsigned long i;
4926
4927
if (WARN_ON_ONCE(!q->mq_freeze_depth))
4928
return -EINVAL;
4929
4930
if (!set)
4931
return -EINVAL;
4932
4933
if (q->nr_requests == nr)
4934
return 0;
4935
4936
blk_mq_quiesce_queue(q);
4937
4938
ret = 0;
4939
queue_for_each_hw_ctx(q, hctx, i) {
4940
if (!hctx->tags)
4941
continue;
4942
/*
4943
* If we're using an MQ scheduler, just update the scheduler
4944
* queue depth. This is similar to what the old code would do.
4945
*/
4946
if (hctx->sched_tags) {
4947
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
4948
nr, true);
4949
} else {
4950
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
4951
false);
4952
}
4953
if (ret)
4954
break;
4955
if (q->elevator && q->elevator->type->ops.depth_updated)
4956
q->elevator->type->ops.depth_updated(hctx);
4957
}
4958
if (!ret) {
4959
q->nr_requests = nr;
4960
if (blk_mq_is_shared_tags(set->flags)) {
4961
if (q->elevator)
4962
blk_mq_tag_update_sched_shared_tags(q);
4963
else
4964
blk_mq_tag_resize_shared_tags(set, nr);
4965
}
4966
}
4967
4968
blk_mq_unquiesce_queue(q);
4969
4970
return ret;
4971
}
4972
4973
/*
4974
* Switch back to the elevator type stored in the xarray.
4975
*/
4976
static void blk_mq_elv_switch_back(struct request_queue *q,
4977
struct xarray *elv_tbl, struct xarray *et_tbl)
4978
{
4979
struct elevator_type *e = xa_load(elv_tbl, q->id);
4980
struct elevator_tags *t = xa_load(et_tbl, q->id);
4981
4982
/* The elv_update_nr_hw_queues unfreezes the queue. */
4983
elv_update_nr_hw_queues(q, e, t);
4984
4985
/* Drop the reference acquired in blk_mq_elv_switch_none. */
4986
if (e)
4987
elevator_put(e);
4988
}
4989
4990
/*
4991
* Stores elevator type in xarray and set current elevator to none. It uses
4992
* q->id as an index to store the elevator type into the xarray.
4993
*/
4994
static int blk_mq_elv_switch_none(struct request_queue *q,
4995
struct xarray *elv_tbl)
4996
{
4997
int ret = 0;
4998
4999
lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
5000
5001
/*
5002
* Accessing q->elevator without holding q->elevator_lock is safe here
5003
* because we're called from nr_hw_queue update which is protected by
5004
* set->update_nr_hwq_lock in the writer context. So, scheduler update/
5005
* switch code (which acquires the same lock in the reader context)
5006
* can't run concurrently.
5007
*/
5008
if (q->elevator) {
5009
5010
ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
5011
if (WARN_ON_ONCE(ret))
5012
return ret;
5013
5014
/*
5015
* Before we switch elevator to 'none', take a reference to
5016
* the elevator module so that while nr_hw_queue update is
5017
* running, no one can remove elevator module. We'd put the
5018
* reference to elevator module later when we switch back
5019
* elevator.
5020
*/
5021
__elevator_get(q->elevator->type);
5022
5023
elevator_set_none(q);
5024
}
5025
return ret;
5026
}
5027
5028
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
5029
int nr_hw_queues)
5030
{
5031
struct request_queue *q;
5032
int prev_nr_hw_queues = set->nr_hw_queues;
5033
unsigned int memflags;
5034
int i;
5035
struct xarray elv_tbl, et_tbl;
5036
bool queues_frozen = false;
5037
5038
lockdep_assert_held(&set->tag_list_lock);
5039
5040
if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
5041
nr_hw_queues = nr_cpu_ids;
5042
if (nr_hw_queues < 1)
5043
return;
5044
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
5045
return;
5046
5047
memflags = memalloc_noio_save();
5048
5049
xa_init(&et_tbl);
5050
if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
5051
goto out_memalloc_restore;
5052
5053
xa_init(&elv_tbl);
5054
5055
list_for_each_entry(q, &set->tag_list, tag_set_list) {
5056
blk_mq_debugfs_unregister_hctxs(q);
5057
blk_mq_sysfs_unregister_hctxs(q);
5058
}
5059
5060
/*
5061
* Switch IO scheduler to 'none', cleaning up the data associated
5062
* with the previous scheduler. We will switch back once we are done
5063
* updating the new sw to hw queue mappings.
5064
*/
5065
list_for_each_entry(q, &set->tag_list, tag_set_list)
5066
if (blk_mq_elv_switch_none(q, &elv_tbl))
5067
goto switch_back;
5068
5069
list_for_each_entry(q, &set->tag_list, tag_set_list)
5070
blk_mq_freeze_queue_nomemsave(q);
5071
queues_frozen = true;
5072
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
5073
goto switch_back;
5074
5075
fallback:
5076
blk_mq_update_queue_map(set);
5077
list_for_each_entry(q, &set->tag_list, tag_set_list) {
5078
__blk_mq_realloc_hw_ctxs(set, q);
5079
5080
if (q->nr_hw_queues != set->nr_hw_queues) {
5081
int i = prev_nr_hw_queues;
5082
5083
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
5084
nr_hw_queues, prev_nr_hw_queues);
5085
for (; i < set->nr_hw_queues; i++)
5086
__blk_mq_free_map_and_rqs(set, i);
5087
5088
set->nr_hw_queues = prev_nr_hw_queues;
5089
goto fallback;
5090
}
5091
blk_mq_map_swqueue(q);
5092
}
5093
switch_back:
5094
/* The blk_mq_elv_switch_back unfreezes queue for us. */
5095
list_for_each_entry(q, &set->tag_list, tag_set_list) {
5096
/* switch_back expects queue to be frozen */
5097
if (!queues_frozen)
5098
blk_mq_freeze_queue_nomemsave(q);
5099
blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
5100
}
5101
5102
list_for_each_entry(q, &set->tag_list, tag_set_list) {
5103
blk_mq_sysfs_register_hctxs(q);
5104
blk_mq_debugfs_register_hctxs(q);
5105
5106
blk_mq_remove_hw_queues_cpuhp(q);
5107
blk_mq_add_hw_queues_cpuhp(q);
5108
}
5109
5110
xa_destroy(&elv_tbl);
5111
xa_destroy(&et_tbl);
5112
out_memalloc_restore:
5113
memalloc_noio_restore(memflags);
5114
5115
/* Free the excess tags when nr_hw_queues shrink. */
5116
for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
5117
__blk_mq_free_map_and_rqs(set, i);
5118
}
5119
5120
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
5121
{
5122
down_write(&set->update_nr_hwq_lock);
5123
mutex_lock(&set->tag_list_lock);
5124
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
5125
mutex_unlock(&set->tag_list_lock);
5126
up_write(&set->update_nr_hwq_lock);
5127
}
5128
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
5129
5130
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
5131
struct io_comp_batch *iob, unsigned int flags)
5132
{
5133
long state = get_current_state();
5134
int ret;
5135
5136
do {
5137
ret = q->mq_ops->poll(hctx, iob);
5138
if (ret > 0) {
5139
__set_current_state(TASK_RUNNING);
5140
return ret;
5141
}
5142
5143
if (signal_pending_state(state, current))
5144
__set_current_state(TASK_RUNNING);
5145
if (task_is_running(current))
5146
return 1;
5147
5148
if (ret < 0 || (flags & BLK_POLL_ONESHOT))
5149
break;
5150
cpu_relax();
5151
} while (!need_resched());
5152
5153
__set_current_state(TASK_RUNNING);
5154
return 0;
5155
}
5156
5157
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
5158
struct io_comp_batch *iob, unsigned int flags)
5159
{
5160
if (!blk_mq_can_poll(q))
5161
return 0;
5162
return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
5163
}
5164
5165
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
5166
unsigned int poll_flags)
5167
{
5168
struct request_queue *q = rq->q;
5169
int ret;
5170
5171
if (!blk_rq_is_poll(rq))
5172
return 0;
5173
if (!percpu_ref_tryget(&q->q_usage_counter))
5174
return 0;
5175
5176
ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
5177
blk_queue_exit(q);
5178
5179
return ret;
5180
}
5181
EXPORT_SYMBOL_GPL(blk_rq_poll);
5182
5183
unsigned int blk_mq_rq_cpu(struct request *rq)
5184
{
5185
return rq->mq_ctx->cpu;
5186
}
5187
EXPORT_SYMBOL(blk_mq_rq_cpu);
5188
5189
void blk_mq_cancel_work_sync(struct request_queue *q)
5190
{
5191
struct blk_mq_hw_ctx *hctx;
5192
unsigned long i;
5193
5194
cancel_delayed_work_sync(&q->requeue_work);
5195
5196
queue_for_each_hw_ctx(q, hctx, i)
5197
cancel_delayed_work_sync(&hctx->run_work);
5198
}
5199
5200
static int __init blk_mq_init(void)
5201
{
5202
int i;
5203
5204
for_each_possible_cpu(i)
5205
init_llist_head(&per_cpu(blk_cpu_done, i));
5206
for_each_possible_cpu(i)
5207
INIT_CSD(&per_cpu(blk_cpu_csd, i),
5208
__blk_mq_complete_request_remote, NULL);
5209
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
5210
5211
cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
5212
"block/softirq:dead", NULL,
5213
blk_softirq_cpu_dead);
5214
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
5215
blk_mq_hctx_notify_dead);
5216
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
5217
blk_mq_hctx_notify_online,
5218
blk_mq_hctx_notify_offline);
5219
return 0;
5220
}
5221
subsys_initcall(blk_mq_init);
5222
5223