// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency. Two passes are used: the first prioritizes discarding extents,
 * and the second gives trimming of the bitmaps the best opportunity to
 * coalesce. The block_groups are maintained on multiple lists to allow for
 * multiple passes with different discard filter requirements. A delayed work
 * item is used to manage discarding with a timeout determined by the maximum
 * of the delay incurred by the iops rate limit, the byte rate limit, and the
 * max delay of BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a full free block group
 * after forgetting it. When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list. Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence. We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only
 * letting trimmed regions merge with other trimmed regions. This can cause
 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap. This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

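/*
 * Bounds for the delay between discard work items; the base delay is
 * recomputed within these limits by btrfs_discard_calc_delay(). The iops
 * value is the default limit set in btrfs_discard_init().
 */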
#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  const struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

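/*
 * Add a block group to the tail of its discard list (LRU order), taking a
 * reference only if it was not already queued. A block group that is newly
 * queued or coming off the unused list has its eligible time and cursor state
 * reset, and the unused index is bumped to BTRFS_DISCARD_INDEX_START. Callers
 * must hold discard_ctl->lock.
 */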
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

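/*
 * Locked wrapper around __add_to_discard_list(). Only data block groups are
 * queued, and only while discard is allowed to run.
 */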
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	if (!btrfs_run_discard_work(discard_ctl))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

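/*
 * Queue a block group on the special unused list (BTRFS_DISCARD_INDEX_UNUSED)
 * with the shorter BTRFS_DISCARD_UNUSED_DELAY, so a fully freed block group
 * receives its final trim before being handed over to the unused_bgs path.
 */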
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

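/*
 * Take a block group off whichever discard list it is on, dropping the list's
 * reference if it was queued. Returns true if the block group was the one
 * currently being discarded, in which case discard_ctl->block_group is
 * cleared as well.
 */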
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	if (queued)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of the block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here. Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
				/*
				 * The block group must have been moved to
				 * another discard list even if discard was
				 * disabled in the meantime or a transaction
				 * abort happened, otherwise we can end up in
				 * an infinite loop, always jumping into the
				 * 'again' label and keep getting this block
				 * group over and over in case there are no
				 * other block groups in the discard lists.
				 */
				ASSERT(block_group->discard_index !=
				       BTRFS_DISCARD_INDEX_UNUSED,
				       "discard_index=%d",
				       block_group->discard_index);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
	}
	if (block_group) {
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index. If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists. If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

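/*
 * Called with discard_ctl->lock held. Compute the delay until the next
 * discard: the base delay_ms, maxed with the kbps rate limit delay and the
 * block group's eligibility timeout, minus the time already elapsed when
 * overriding an armed timer, and (re)arm the delayed work with it.
 */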
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path. Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group)
		return;
	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		return;
	}
	if (now < block_group->discard_eligible_time) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter. In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based off the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl. It maintains a current
 * counter and a previous counter passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

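/*
 * If async discard is enabled, pick up the block groups already sitting on
 * the unused_bgs list and allow the discard work to run; otherwise tear down
 * any leftover discard state.
 */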
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

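/*
 * Clear the running flag so no new discard work is scheduled; already queued
 * work is cancelled separately in btrfs_discard_cleanup().
 */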
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

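/* Initialize the discard control structure with default limits and empty lists. */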
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

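/* Stop discard, cancel any pending work and purge the remaining block groups. */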
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}