CoCalc -- defrag.c

GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/defrag.c
²⁶²⁷⁸ views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
 * Copyright (C) 2007 Oracle.  All rights reserved.
4
 */
5

6
#include <linux/sched.h>
7
#include "ctree.h"
8
#include "disk-io.h"
9
#include "transaction.h"
10
#include "locking.h"
11
#include "accessors.h"
12
#include "messages.h"
13
#include "delalloc-space.h"
14
#include "subpage.h"
15
#include "defrag.h"
16
#include "file-item.h"
17
#include "super.h"
18

19
static struct kmem_cache *btrfs_inode_defrag_cachep;
20

21
/*
22
 * When auto defrag is enabled we queue up these defrag structs to remember
23
 * which inodes need defragging passes.
24
 */
25
struct inode_defrag {
26
	struct rb_node rb_node;
27
	/* Inode number */
28
	u64 ino;
29
	/*
30
	 * Transid where the defrag was added, we search for extents newer than
31
	 * this.
32
	 */
33
	u64 transid;
34

35
	/* Root objectid */
36
	u64 root;
37

38
	/*
39
	 * The extent size threshold for autodefrag.
40
	 *
41
	 * This value is different for compressed/non-compressed extents, thus
42
	 * needs to be passed from higher layer.
43
	 * (aka, inode_should_defrag())
44
	 */
45
	u32 extent_thresh;
46
};
47

48
static int compare_inode_defrag(const struct inode_defrag *defrag1,
49
				const struct inode_defrag *defrag2)
50
{
51
	if (defrag1->root > defrag2->root)
52
		return 1;
53
	else if (defrag1->root < defrag2->root)
54
		return -1;
55
	else if (defrag1->ino > defrag2->ino)
56
		return 1;
57
	else if (defrag1->ino < defrag2->ino)
58
		return -1;
59
	else
60
		return 0;
61
}
62

63
static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
64
{
65
	const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
66
	const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
67

68
	return compare_inode_defrag(new_defrag, existing_defrag);
69
}
70

71
/*
72
 * Insert a record for an inode into the defrag tree.  The lock must be held
73
 * already.
74
 *
75
 * If you're inserting a record for an older transid than an existing record,
76
 * the transid already in the tree is lowered.
77
 */
78
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
79
				     struct inode_defrag *defrag)
80
{
81
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
82
	struct rb_node *node;
83

84
	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
85
	if (node) {
86
		struct inode_defrag *entry;
87

88
		entry = rb_entry(node, struct inode_defrag, rb_node);
89
		/*
90
		 * If we're reinserting an entry for an old defrag run, make
91
		 * sure to lower the transid of our existing record.
92
		 */
93
		if (defrag->transid < entry->transid)
94
			entry->transid = defrag->transid;
95
		entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
96
		return -EEXIST;
97
	}
98
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
99
	return 0;
100
}
101

102
static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
103
{
104
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
105
		return false;
106

107
	if (btrfs_fs_closing(fs_info))
108
		return false;
109

110
	return true;
111
}
112

113
/*
114
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
115
 * returned as they're not considered fatal.
116
 */
117
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
118
{
119
	struct btrfs_root *root = inode->root;
120
	struct btrfs_fs_info *fs_info = root->fs_info;
121
	struct inode_defrag *defrag;
122
	int ret;
123

124
	if (!need_auto_defrag(fs_info))
125
		return;
126

127
	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
128
		return;
129

130
	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
131
	if (!defrag)
132
		return;
133

134
	defrag->ino = btrfs_ino(inode);
135
	defrag->transid = btrfs_get_root_last_trans(root);
136
	defrag->root = btrfs_root_id(root);
137
	defrag->extent_thresh = extent_thresh;
138

139
	spin_lock(&fs_info->defrag_inodes_lock);
140
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
141
		/*
142
		 * If we set IN_DEFRAG flag and evict the inode from memory,
143
		 * and then re-read this inode, this new inode doesn't have
144
		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
145
		 */
146
		ret = btrfs_insert_inode_defrag(inode, defrag);
147
		if (ret)
148
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
149
	} else {
150
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
151
	}
152
	spin_unlock(&fs_info->defrag_inodes_lock);
153
}
154

155
/*
156
 * Pick the defragable inode that we want, if it doesn't exist, we will get the
157
 * next one.
158
 */
159
static struct inode_defrag *btrfs_pick_defrag_inode(
160
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
161
{
162
	struct inode_defrag *entry = NULL;
163
	struct inode_defrag tmp;
164
	struct rb_node *p;
165
	struct rb_node *parent = NULL;
166
	int ret;
167

168
	tmp.ino = ino;
169
	tmp.root = root;
170

171
	spin_lock(&fs_info->defrag_inodes_lock);
172
	p = fs_info->defrag_inodes.rb_node;
173
	while (p) {
174
		parent = p;
175
		entry = rb_entry(parent, struct inode_defrag, rb_node);
176

177
		ret = compare_inode_defrag(&tmp, entry);
178
		if (ret < 0)
179
			p = parent->rb_left;
180
		else if (ret > 0)
181
			p = parent->rb_right;
182
		else
183
			goto out;
184
	}
185

186
	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
187
		parent = rb_next(parent);
188
		entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
189
	}
190
out:
191
	if (entry)
192
		rb_erase(parent, &fs_info->defrag_inodes);
193
	spin_unlock(&fs_info->defrag_inodes_lock);
194
	return entry;
195
}
196

197
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
198
{
199
	struct inode_defrag *defrag, *next;
200

201
	spin_lock(&fs_info->defrag_inodes_lock);
202

203
	rbtree_postorder_for_each_entry_safe(defrag, next,
204
					     &fs_info->defrag_inodes, rb_node)
205
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
206

207
	fs_info->defrag_inodes = RB_ROOT;
208

209
	spin_unlock(&fs_info->defrag_inodes_lock);
210
}
211

212
#define BTRFS_DEFRAG_BATCH	1024
213

214
static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
215
				  struct inode_defrag *defrag,
216
				  struct file_ra_state *ra)
217
{
218
	struct btrfs_root *inode_root;
219
	struct btrfs_inode *inode;
220
	struct btrfs_ioctl_defrag_range_args range;
221
	int ret = 0;
222
	u64 cur = 0;
223

224
again:
225
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
226
		goto cleanup;
227
	if (!need_auto_defrag(fs_info))
228
		goto cleanup;
229

230
	/* Get the inode */
231
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
232
	if (IS_ERR(inode_root)) {
233
		ret = PTR_ERR(inode_root);
234
		goto cleanup;
235
	}
236

237
	inode = btrfs_iget(defrag->ino, inode_root);
238
	btrfs_put_root(inode_root);
239
	if (IS_ERR(inode)) {
240
		ret = PTR_ERR(inode);
241
		goto cleanup;
242
	}
243

244
	if (cur >= i_size_read(&inode->vfs_inode)) {
245
		iput(&inode->vfs_inode);
246
		goto cleanup;
247
	}
248

249
	/* Do a chunk of defrag */
250
	clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
251
	memset(&range, 0, sizeof(range));
252
	range.len = (u64)-1;
253
	range.start = cur;
254
	range.extent_thresh = defrag->extent_thresh;
255
	file_ra_state_init(ra, inode->vfs_inode.i_mapping);
256

257
	sb_start_write(fs_info->sb);
258
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
259
				BTRFS_DEFRAG_BATCH);
260
	sb_end_write(fs_info->sb);
261
	iput(&inode->vfs_inode);
262

263
	if (ret < 0)
264
		goto cleanup;
265

266
	cur = max(cur + fs_info->sectorsize, range.start);
267
	goto again;
268

269
cleanup:
270
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
271
	return ret;
272
}
273

274
/*
275
 * Run through the list of inodes in the FS that need defragging.
276
 */
277
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
278
{
279
	struct inode_defrag *defrag;
280
	u64 first_ino = 0;
281
	u64 root_objectid = 0;
282

283
	atomic_inc(&fs_info->defrag_running);
284
	while (1) {
285
		struct file_ra_state ra = { 0 };
286

287
		/* Pause the auto defragger. */
288
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
289
			break;
290

291
		if (!need_auto_defrag(fs_info))
292
			break;
293

294
		/* find an inode to defrag */
295
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
296
		if (!defrag) {
297
			if (root_objectid || first_ino) {
298
				root_objectid = 0;
299
				first_ino = 0;
300
				continue;
301
			} else {
302
				break;
303
			}
304
		}
305

306
		first_ino = defrag->ino + 1;
307
		root_objectid = defrag->root;
308

309
		btrfs_run_defrag_inode(fs_info, defrag, &ra);
310
	}
311
	atomic_dec(&fs_info->defrag_running);
312

313
	/*
314
	 * During unmount, we use the transaction_wait queue to wait for the
315
	 * defragger to stop.
316
	 */
317
	wake_up(&fs_info->transaction_wait);
318
	return 0;
319
}
320

321
/*
322
 * Check if two blocks addresses are close, used by defrag.
323
 */
324
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
325
{
326
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
327
		return true;
328
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
329
		return true;
330
	return false;
331
}
332

333
/*
334
 * Go through all the leaves pointed to by a node and reallocate them so that
335
 * disk order is close to key order.
336
 */
337
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
338
			      struct btrfs_root *root,
339
			      struct extent_buffer *parent,
340
			      int start_slot, u64 *last_ret,
341
			      struct btrfs_key *progress)
342
{
343
	struct btrfs_fs_info *fs_info = root->fs_info;
344
	const u32 blocksize = fs_info->nodesize;
345
	const int end_slot = btrfs_header_nritems(parent) - 1;
346
	u64 search_start = *last_ret;
347
	u64 last_block = 0;
348
	int ret = 0;
349
	bool progress_passed = false;
350

351
	/*
352
	 * COWing must happen through a running transaction, which always
353
	 * matches the current fs generation (it's a transaction with a state
354
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
355
	 * into error state to prevent the commit of any transaction.
356
	 */
357
	if (unlikely(trans->transaction != fs_info->running_transaction ||
358
		     trans->transid != fs_info->generation)) {
359
		btrfs_abort_transaction(trans, -EUCLEAN);
360
		btrfs_crit(fs_info,
361
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
362
			   parent->start, btrfs_root_id(root), trans->transid,
363
			   fs_info->running_transaction->transid,
364
			   fs_info->generation);
365
		return -EUCLEAN;
366
	}
367

368
	if (btrfs_header_nritems(parent) <= 1)
369
		return 0;
370

371
	for (int i = start_slot; i <= end_slot; i++) {
372
		struct extent_buffer *cur;
373
		struct btrfs_disk_key disk_key;
374
		u64 blocknr;
375
		u64 other;
376
		bool close = true;
377

378
		btrfs_node_key(parent, &disk_key, i);
379
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
380
			continue;
381

382
		progress_passed = true;
383
		blocknr = btrfs_node_blockptr(parent, i);
384
		if (last_block == 0)
385
			last_block = blocknr;
386

387
		if (i > 0) {
388
			other = btrfs_node_blockptr(parent, i - 1);
389
			close = close_blocks(blocknr, other, blocksize);
390
		}
391
		if (!close && i < end_slot) {
392
			other = btrfs_node_blockptr(parent, i + 1);
393
			close = close_blocks(blocknr, other, blocksize);
394
		}
395
		if (close) {
396
			last_block = blocknr;
397
			continue;
398
		}
399

400
		cur = btrfs_read_node_slot(parent, i);
401
		if (IS_ERR(cur))
402
			return PTR_ERR(cur);
403
		if (search_start == 0)
404
			search_start = last_block;
405

406
		btrfs_tree_lock(cur);
407
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
408
					    &cur, search_start,
409
					    min(16 * blocksize,
410
						(end_slot - i) * blocksize),
411
					    BTRFS_NESTING_COW);
412
		if (ret) {
413
			btrfs_tree_unlock(cur);
414
			free_extent_buffer(cur);
415
			break;
416
		}
417
		search_start = cur->start;
418
		last_block = cur->start;
419
		*last_ret = search_start;
420
		btrfs_tree_unlock(cur);
421
		free_extent_buffer(cur);
422
	}
423
	return ret;
424
}
425

426
/*
427
 * Defrag all the leaves in a given btree.
428
 * Read all the leaves and try to get key order to
429
 * better reflect disk order
430
 */
431

432
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
433
			       struct btrfs_root *root)
434
{
435
	struct btrfs_path *path = NULL;
436
	struct btrfs_key key;
437
	int ret = 0;
438
	int wret;
439
	int level;
440
	int next_key_ret = 0;
441
	u64 last_ret = 0;
442

443
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
444
		goto out;
445

446
	path = btrfs_alloc_path();
447
	if (!path) {
448
		ret = -ENOMEM;
449
		goto out;
450
	}
451

452
	level = btrfs_header_level(root->node);
453

454
	if (level == 0)
455
		goto out;
456

457
	if (root->defrag_progress.objectid == 0) {
458
		struct extent_buffer *root_node;
459
		u32 nritems;
460

461
		root_node = btrfs_lock_root_node(root);
462
		nritems = btrfs_header_nritems(root_node);
463
		root->defrag_max.objectid = 0;
464
		/* from above we know this is not a leaf */
465
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
466
				      nritems - 1);
467
		btrfs_tree_unlock(root_node);
468
		free_extent_buffer(root_node);
469
		memset(&key, 0, sizeof(key));
470
	} else {
471
		memcpy(&key, &root->defrag_progress, sizeof(key));
472
	}
473

474
	path->keep_locks = 1;
475

476
	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
477
	if (ret < 0)
478
		goto out;
479
	if (ret > 0) {
480
		ret = 0;
481
		goto out;
482
	}
483
	btrfs_release_path(path);
484
	/*
485
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
486
	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
487
	 * a deadlock (attempting to write lock an already write locked leaf).
488
	 */
489
	path->lowest_level = 1;
490
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
491

492
	if (wret < 0) {
493
		ret = wret;
494
		goto out;
495
	}
496
	if (!path->nodes[1]) {
497
		ret = 0;
498
		goto out;
499
	}
500
	/*
501
	 * The node at level 1 must always be locked when our path has
502
	 * keep_locks set and lowest_level is 1, regardless of the value of
503
	 * path->slots[1].
504
	 */
505
	ASSERT(path->locks[1] != 0);
506
	ret = btrfs_realloc_node(trans, root,
507
				 path->nodes[1], 0,
508
				 &last_ret,
509
				 &root->defrag_progress);
510
	if (ret) {
511
		WARN_ON(ret == -EAGAIN);
512
		goto out;
513
	}
514
	/*
515
	 * Now that we reallocated the node we can find the next key. Note that
516
	 * btrfs_find_next_key() can release our path and do another search
517
	 * without COWing, this is because even with path->keep_locks = 1,
518
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
519
	 * node when path->slots[node_level - 1] does not point to the last
520
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
521
	 * we search for the next key after reallocating our node.
522
	 */
523
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
524
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
525
					   BTRFS_OLDEST_GENERATION);
526
	if (next_key_ret == 0) {
527
		memcpy(&root->defrag_progress, &key, sizeof(key));
528
		ret = -EAGAIN;
529
	}
530
out:
531
	btrfs_free_path(path);
532
	if (ret == -EAGAIN) {
533
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
534
			goto done;
535
		if (root->defrag_max.type > root->defrag_progress.type)
536
			goto done;
537
		if (root->defrag_max.offset > root->defrag_progress.offset)
538
			goto done;
539
		ret = 0;
540
	}
541
done:
542
	if (ret != -EAGAIN)
543
		memset(&root->defrag_progress, 0,
544
		       sizeof(root->defrag_progress));
545

546
	return ret;
547
}
548

549
/*
550
 * Defrag a given btree.  Every leaf in the btree is read and defragmented.
551
 */
552
int btrfs_defrag_root(struct btrfs_root *root)
553
{
554
	struct btrfs_fs_info *fs_info = root->fs_info;
555
	int ret;
556

557
	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
558
		return 0;
559

560
	while (1) {
561
		struct btrfs_trans_handle *trans;
562

563
		trans = btrfs_start_transaction(root, 0);
564
		if (IS_ERR(trans)) {
565
			ret = PTR_ERR(trans);
566
			break;
567
		}
568

569
		ret = btrfs_defrag_leaves(trans, root);
570

571
		btrfs_end_transaction(trans);
572
		btrfs_btree_balance_dirty(fs_info);
573
		cond_resched();
574

575
		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
576
			break;
577

578
		if (btrfs_defrag_cancelled(fs_info)) {
579
			btrfs_debug(fs_info, "defrag_root cancelled");
580
			ret = -EAGAIN;
581
			break;
582
		}
583
	}
584
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
585
	return ret;
586
}
587

588
/*
589
 * Defrag specific helper to get an extent map.
590
 *
591
 * Differences between this and btrfs_get_extent() are:
592
 *
593
 * - No extent_map will be added to inode->extent_tree
594
 *   To reduce memory usage in the long run.
595
 *
596
 * - Extra optimization to skip file extents older than @newer_than
597
 *   By using btrfs_search_forward() we can skip entire file ranges that
598
 *   have extents created in past transactions, because btrfs_search_forward()
599
 *   will not visit leaves and nodes with a generation smaller than given
600
 *   minimal generation threshold (@newer_than).
601
 *
602
 * Return valid em if we find a file extent matching the requirement.
603
 * Return NULL if we can not find a file extent matching the requirement.
604
 *
605
 * Return ERR_PTR() for error.
606
 */
607
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
608
					    u64 start, u64 newer_than)
609
{
610
	struct btrfs_root *root = inode->root;
611
	struct btrfs_file_extent_item *fi;
612
	struct btrfs_path path = { 0 };
613
	struct extent_map *em;
614
	struct btrfs_key key;
615
	u64 ino = btrfs_ino(inode);
616
	int ret;
617

618
	em = btrfs_alloc_extent_map();
619
	if (!em) {
620
		ret = -ENOMEM;
621
		goto err;
622
	}
623

624
	key.objectid = ino;
625
	key.type = BTRFS_EXTENT_DATA_KEY;
626
	key.offset = start;
627

628
	if (newer_than) {
629
		ret = btrfs_search_forward(root, &key, &path, newer_than);
630
		if (ret < 0)
631
			goto err;
632
		/* Can't find anything newer */
633
		if (ret > 0)
634
			goto not_found;
635
	} else {
636
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
637
		if (ret < 0)
638
			goto err;
639
	}
640
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
641
		/*
642
		 * If btrfs_search_slot() makes path to point beyond nritems,
643
		 * we should not have an empty leaf, as this inode must at
644
		 * least have its INODE_ITEM.
645
		 */
646
		ASSERT(btrfs_header_nritems(path.nodes[0]));
647
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
648
	}
649
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
650
	/* Perfect match, no need to go one slot back */
651
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
652
	    key.offset == start)
653
		goto iterate;
654

655
	/* We didn't find a perfect match, needs to go one slot back */
656
	if (path.slots[0] > 0) {
657
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
658
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
659
			path.slots[0]--;
660
	}
661

662
iterate:
663
	/* Iterate through the path to find a file extent covering @start */
664
	while (true) {
665
		u64 extent_end;
666

667
		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
668
			goto next;
669

670
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
671

672
		/*
673
		 * We may go one slot back to INODE_REF/XATTR item, then
674
		 * need to go forward until we reach an EXTENT_DATA.
675
		 * But we should still has the correct ino as key.objectid.
676
		 */
677
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
678
			goto next;
679

680
		/* It's beyond our target range, definitely not extent found */
681
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
682
			goto not_found;
683

684
		/*
685
		 *	|	|<- File extent ->|
686
		 *	\- start
687
		 *
688
		 * This means there is a hole between start and key.offset.
689
		 */
690
		if (key.offset > start) {
691
			em->start = start;
692
			em->disk_bytenr = EXTENT_MAP_HOLE;
693
			em->disk_num_bytes = 0;
694
			em->ram_bytes = 0;
695
			em->offset = 0;
696
			em->len = key.offset - start;
697
			break;
698
		}
699

700
		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
701
				    struct btrfs_file_extent_item);
702
		extent_end = btrfs_file_extent_end(&path);
703

704
		/*
705
		 *	|<- file extent ->|	|
706
		 *				\- start
707
		 *
708
		 * We haven't reached start, search next slot.
709
		 */
710
		if (extent_end <= start)
711
			goto next;
712

713
		/* Now this extent covers @start, convert it to em */
714
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
715
		break;
716
next:
717
		ret = btrfs_next_item(root, &path);
718
		if (ret < 0)
719
			goto err;
720
		if (ret > 0)
721
			goto not_found;
722
	}
723
	btrfs_release_path(&path);
724
	return em;
725

726
not_found:
727
	btrfs_release_path(&path);
728
	btrfs_free_extent_map(em);
729
	return NULL;
730

731
err:
732
	btrfs_release_path(&path);
733
	btrfs_free_extent_map(em);
734
	return ERR_PTR(ret);
735
}
736

737
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
738
					       u64 newer_than, bool locked)
739
{
740
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
741
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
742
	struct extent_map *em;
743
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
744

745
	/*
746
	 * Hopefully we have this extent in the tree already, try without the
747
	 * full extent lock.
748
	 */
749
	read_lock(&em_tree->lock);
750
	em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
751
	read_unlock(&em_tree->lock);
752

753
	/*
754
	 * We can get a merged extent, in that case, we need to re-search
755
	 * tree to get the original em for defrag.
756
	 *
757
	 * This is because even if we have adjacent extents that are contiguous
758
	 * and compatible (same type and flags), we still want to defrag them
759
	 * so that we use less metadata (extent items in the extent tree and
760
	 * file extent items in the inode's subvolume tree).
761
	 */
762
	if (em && (em->flags & EXTENT_FLAG_MERGED)) {
763
		btrfs_free_extent_map(em);
764
		em = NULL;
765
	}
766

767
	if (!em) {
768
		struct extent_state *cached = NULL;
769
		u64 end = start + sectorsize - 1;
770

771
		/* Get the big lock and read metadata off disk. */
772
		if (!locked)
773
			btrfs_lock_extent(io_tree, start, end, &cached);
774
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
775
		if (!locked)
776
			btrfs_unlock_extent(io_tree, start, end, &cached);
777

778
		if (IS_ERR(em))
779
			return NULL;
780
	}
781

782
	return em;
783
}
784

785
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
786
				   const struct extent_map *em)
787
{
788
	if (btrfs_extent_map_is_compressed(em))
789
		return BTRFS_MAX_COMPRESSED;
790
	return fs_info->max_extent_size;
791
}
792

793
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
794
				     u32 extent_thresh, u64 newer_than, bool locked)
795
{
796
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
797
	struct extent_map *next;
798
	bool ret = false;
799

800
	/* This is the last extent */
801
	if (em->start + em->len >= i_size_read(inode))
802
		return false;
803

804
	/*
805
	 * Here we need to pass @newer_then when checking the next extent, or
806
	 * we will hit a case we mark current extent for defrag, but the next
807
	 * one will not be a target.
808
	 * This will just cause extra IO without really reducing the fragments.
809
	 */
810
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
811
	/* No more em or hole */
812
	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
813
		goto out;
814
	if (next->flags & EXTENT_FLAG_PREALLOC)
815
		goto out;
816
	/*
817
	 * If the next extent is at its max capacity, defragging current extent
818
	 * makes no sense, as the total number of extents won't change.
819
	 */
820
	if (next->len >= get_extent_max_capacity(fs_info, em))
821
		goto out;
822
	/* Skip older extent */
823
	if (next->generation < newer_than)
824
		goto out;
825
	/* Also check extent size */
826
	if (next->len >= extent_thresh)
827
		goto out;
828

829
	ret = true;
830
out:
831
	btrfs_free_extent_map(next);
832
	return ret;
833
}
834

835
/*
836
 * Prepare one page to be defragged.
837
 *
838
 * This will ensure:
839
 *
840
 * - Returned page is locked and has been set up properly.
841
 * - No ordered extent exists in the page.
842
 * - The page is uptodate.
843
 *
844
 * NOTE: Caller should also wait for page writeback after the cluster is
845
 * prepared, here we don't do writeback wait for each page.
846
 */
847
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
848
{
849
	struct address_space *mapping = inode->vfs_inode.i_mapping;
850
	gfp_t mask = btrfs_alloc_write_mask(mapping);
851
	u64 lock_start;
852
	u64 lock_end;
853
	struct extent_state *cached_state = NULL;
854
	struct folio *folio;
855
	int ret;
856

857
again:
858
	/* TODO: Add order fgp order flags when large folios are fully enabled. */
859
	folio = __filemap_get_folio(mapping, index,
860
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
861
	if (IS_ERR(folio))
862
		return folio;
863

864
	/*
865
	 * Since we can defragment files opened read-only, we can encounter
866
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
867
	 *
868
	 * The IO for such large folios is not fully tested, thus return
869
	 * an error to reject such folios unless it's an experimental build.
870
	 *
871
	 * Filesystem transparent huge pages are typically only used for
872
	 * executables that explicitly enable them, so this isn't very
873
	 * restrictive.
874
	 */
875
	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
876
		folio_unlock(folio);
877
		folio_put(folio);
878
		return ERR_PTR(-ETXTBSY);
879
	}
880

881
	ret = set_folio_extent_mapped(folio);
882
	if (ret < 0) {
883
		folio_unlock(folio);
884
		folio_put(folio);
885
		return ERR_PTR(ret);
886
	}
887

888
	lock_start = folio_pos(folio);
889
	lock_end = folio_end(folio) - 1;
890
	/* Wait for any existing ordered extent in the range */
891
	while (1) {
892
		struct btrfs_ordered_extent *ordered;
893

894
		btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
895
		ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
896
		btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
897
		if (!ordered)
898
			break;
899

900
		folio_unlock(folio);
901
		btrfs_start_ordered_extent(ordered);
902
		btrfs_put_ordered_extent(ordered);
903
		folio_lock(folio);
904
		/*
905
		 * We unlocked the folio above, so we need check if it was
906
		 * released or not.
907
		 */
908
		if (folio->mapping != mapping || !folio->private) {
909
			folio_unlock(folio);
910
			folio_put(folio);
911
			goto again;
912
		}
913
	}
914

915
	/*
916
	 * Now the page range has no ordered extent any more.  Read the page to
917
	 * make it uptodate.
918
	 */
919
	if (!folio_test_uptodate(folio)) {
920
		btrfs_read_folio(NULL, folio);
921
		folio_lock(folio);
922
		if (folio->mapping != mapping || !folio->private) {
923
			folio_unlock(folio);
924
			folio_put(folio);
925
			goto again;
926
		}
927
		if (!folio_test_uptodate(folio)) {
928
			folio_unlock(folio);
929
			folio_put(folio);
930
			return ERR_PTR(-EIO);
931
		}
932
	}
933
	return folio;
934
}
935

936
struct defrag_target_range {
937
	struct list_head list;
938
	u64 start;
939
	u64 len;
940
};
941

942
/*
943
 * Collect all valid target extents.
944
 *
945
 * @start:	   file offset to lookup
946
 * @len:	   length to lookup
947
 * @extent_thresh: file extent size threshold, any extent size >= this value
948
 *		   will be ignored
949
 * @newer_than:    only defrag extents newer than this value
950
 * @do_compress:   whether the defrag is doing compression or no-compression
951
 *		   if true, @extent_thresh will be ignored and all regular
952
 *		   file extents meeting @newer_than will be targets.
953
 * @locked:	   if the range has already held extent lock
954
 * @target_list:   list of targets file extents
955
 */
956
static int defrag_collect_targets(struct btrfs_inode *inode,
957
				  u64 start, u64 len, u32 extent_thresh,
958
				  u64 newer_than, bool do_compress,
959
				  bool locked, struct list_head *target_list,
960
				  u64 *last_scanned_ret)
961
{
962
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
963
	bool last_is_target = false;
964
	u64 cur = start;
965
	int ret = 0;
966

967
	while (cur < start + len) {
968
		struct extent_map *em;
969
		struct defrag_target_range *new;
970
		bool next_mergeable = true;
971
		u64 range_len;
972

973
		last_is_target = false;
974
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
975
		if (!em)
976
			break;
977

978
		/*
979
		 * If the file extent is an inlined one, we may still want to
980
		 * defrag it (fallthrough) if it will cause a regular extent.
981
		 * This is for users who want to convert inline extents to
982
		 * regular ones through max_inline= mount option.
983
		 */
984
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
985
		    em->len <= inode->root->fs_info->max_inline)
986
			goto next;
987

988
		/* Skip holes and preallocated extents. */
989
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
990
		    (em->flags & EXTENT_FLAG_PREALLOC))
991
			goto next;
992

993
		/* Skip older extent */
994
		if (em->generation < newer_than)
995
			goto next;
996

997
		/* This em is under writeback, no need to defrag */
998
		if (em->generation == (u64)-1)
999
			goto next;
1000

1001
		/*
1002
		 * Our start offset might be in the middle of an existing extent
1003
		 * map, so take that into account.
1004
		 */
1005
		range_len = em->len - (cur - em->start);
1006
		/*
1007
		 * If this range of the extent map is already flagged for delalloc,
1008
		 * skip it, because:
1009
		 *
1010
		 * 1) We could deadlock later, when trying to reserve space for
1011
		 *    delalloc, because in case we can't immediately reserve space
1012
		 *    the flusher can start delalloc and wait for the respective
1013
		 *    ordered extents to complete. The deadlock would happen
1014
		 *    because we do the space reservation while holding the range
1015
		 *    locked, and starting writeback, or finishing an ordered
1016
		 *    extent, requires locking the range;
1017
		 *
1018
		 * 2) If there's delalloc there, it means there's dirty pages for
1019
		 *    which writeback has not started yet (we clean the delalloc
1020
		 *    flag when starting writeback and after creating an ordered
1021
		 *    extent). If we mark pages in an adjacent range for defrag,
1022
		 *    then we will have a larger contiguous range for delalloc,
1023
		 *    very likely resulting in a larger extent after writeback is
1024
		 *    triggered (except in a case of free space fragmentation).
1025
		 */
1026
		if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
1027
						EXTENT_DELALLOC))
1028
			goto next;
1029

1030
		/*
1031
		 * For do_compress case, we want to compress all valid file
1032
		 * extents, thus no @extent_thresh or mergeable check.
1033
		 */
1034
		if (do_compress)
1035
			goto add;
1036

1037
		/* Skip too large extent */
1038
		if (em->len >= extent_thresh)
1039
			goto next;
1040

1041
		/*
1042
		 * Skip extents already at its max capacity, this is mostly for
1043
		 * compressed extents, which max cap is only 128K.
1044
		 */
1045
		if (em->len >= get_extent_max_capacity(fs_info, em))
1046
			goto next;
1047

1048
		/*
1049
		 * Normally there are no more extents after an inline one, thus
1050
		 * @next_mergeable will normally be false and not defragged.
1051
		 * So if an inline extent passed all above checks, just add it
1052
		 * for defrag, and be converted to regular extents.
1053
		 */
1054
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
1055
			goto add;
1056

1057
		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
1058
						extent_thresh, newer_than, locked);
1059
		if (!next_mergeable) {
1060
			struct defrag_target_range *last;
1061

1062
			/* Empty target list, no way to merge with last entry */
1063
			if (list_empty(target_list))
1064
				goto next;
1065
			last = list_last_entry(target_list,
1066
					       struct defrag_target_range, list);
1067
			/* Not mergeable with last entry */
1068
			if (last->start + last->len != cur)
1069
				goto next;
1070

1071
			/* Mergeable, fall through to add it to @target_list. */
1072
		}
1073

1074
add:
1075
		last_is_target = true;
1076
		range_len = min(btrfs_extent_map_end(em), start + len) - cur;
1077
		/*
1078
		 * This one is a good target, check if it can be merged into
1079
		 * last range of the target list.
1080
		 */
1081
		if (!list_empty(target_list)) {
1082
			struct defrag_target_range *last;
1083

1084
			last = list_last_entry(target_list,
1085
					       struct defrag_target_range, list);
1086
			ASSERT(last->start + last->len <= cur);
1087
			if (last->start + last->len == cur) {
1088
				/* Mergeable, enlarge the last entry */
1089
				last->len += range_len;
1090
				goto next;
1091
			}
1092
			/* Fall through to allocate a new entry */
1093
		}
1094

1095
		/* Allocate new defrag_target_range */
1096
		new = kmalloc(sizeof(*new), GFP_NOFS);
1097
		if (!new) {
1098
			btrfs_free_extent_map(em);
1099
			ret = -ENOMEM;
1100
			break;
1101
		}
1102
		new->start = cur;
1103
		new->len = range_len;
1104
		list_add_tail(&new->list, target_list);
1105

1106
next:
1107
		cur = btrfs_extent_map_end(em);
1108
		btrfs_free_extent_map(em);
1109
	}
1110
	if (ret < 0) {
1111
		struct defrag_target_range *entry;
1112
		struct defrag_target_range *tmp;
1113

1114
		list_for_each_entry_safe(entry, tmp, target_list, list) {
1115
			list_del_init(&entry->list);
1116
			kfree(entry);
1117
		}
1118
	}
1119
	if (!ret && last_scanned_ret) {
1120
		/*
1121
		 * If the last extent is not a target, the caller can skip to
1122
		 * the end of that extent.
1123
		 * Otherwise, we can only go the end of the specified range.
1124
		 */
1125
		if (!last_is_target)
1126
			*last_scanned_ret = max(cur, *last_scanned_ret);
1127
		else
1128
			*last_scanned_ret = max(start + len, *last_scanned_ret);
1129
	}
1130
	return ret;
1131
}
1132

1133
#define CLUSTER_SIZE	(SZ_256K)
1134
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
1135

1136
/*
1137
 * Defrag one contiguous target range.
1138
 *
1139
 * @inode:	target inode
1140
 * @target:	target range to defrag
1141
 * @pages:	locked pages covering the defrag range
1142
 * @nr_pages:	number of locked pages
1143
 *
1144
 * Caller should ensure:
1145
 *
1146
 * - Pages are prepared
1147
 *   Pages should be locked, no ordered extent in the pages range,
1148
 *   no writeback.
1149
 *
1150
 * - Extent bits are locked
1151
 */
1152
static int defrag_one_locked_target(struct btrfs_inode *inode,
1153
				    struct defrag_target_range *target,
1154
				    struct folio **folios, int nr_pages,
1155
				    struct extent_state **cached_state)
1156
{
1157
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1158
	struct extent_changeset *data_reserved = NULL;
1159
	const u64 start = target->start;
1160
	const u64 len = target->len;
1161
	int ret = 0;
1162

1163
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
1164
	if (ret < 0)
1165
		return ret;
1166
	btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
1167
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1168
			       EXTENT_DEFRAG, cached_state);
1169
	btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
1170
			     EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
1171

1172
	/*
1173
	 * Update the page status.
1174
	 * Due to possible large folios, we have to check all folios one by one.
1175
	 */
1176
	for (int i = 0; i < nr_pages && folios[i]; i++) {
1177
		struct folio *folio = folios[i];
1178

1179
		if (!folio)
1180
			break;
1181
		if (start >= folio_end(folio) || start + len <= folio_pos(folio))
1182
			continue;
1183
		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
1184
		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
1185
	}
1186
	btrfs_delalloc_release_extents(inode, len);
1187
	extent_changeset_free(data_reserved);
1188

1189
	return ret;
1190
}
1191

1192
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1193
			    u32 extent_thresh, u64 newer_than, bool do_compress,
1194
			    u64 *last_scanned_ret)
1195
{
1196
	struct extent_state *cached_state = NULL;
1197
	struct defrag_target_range *entry;
1198
	struct defrag_target_range *tmp;
1199
	LIST_HEAD(target_list);
1200
	struct folio **folios;
1201
	const u32 sectorsize = inode->root->fs_info->sectorsize;
1202
	u64 cur = start;
1203
	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
1204
				      (start >> PAGE_SHIFT) + 1;
1205
	int ret = 0;
1206

1207
	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1208
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1209

1210
	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
1211
	if (!folios)
1212
		return -ENOMEM;
1213

1214
	/* Prepare all pages */
1215
	for (int i = 0; cur < start + len && i < nr_pages; i++) {
1216
		folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
1217
		if (IS_ERR(folios[i])) {
1218
			ret = PTR_ERR(folios[i]);
1219
			folios[i] = NULL;
1220
			goto free_folios;
1221
		}
1222
		cur = folio_end(folios[i]);
1223
	}
1224
	for (int i = 0; i < nr_pages; i++) {
1225
		if (!folios[i])
1226
			break;
1227
		folio_wait_writeback(folios[i]);
1228
	}
1229

1230
	/* We should get at least one folio. */
1231
	ASSERT(folios[0]);
1232
	/* Lock the pages range */
1233
	btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
1234
	/*
1235
	 * Now we have a consistent view about the extent map, re-check
1236
	 * which range really needs to be defragged.
1237
	 *
1238
	 * And this time we have extent locked already, pass @locked = true
1239
	 * so that we won't relock the extent range and cause deadlock.
1240
	 */
1241
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1242
				     newer_than, do_compress, true,
1243
				     &target_list, last_scanned_ret);
1244
	if (ret < 0)
1245
		goto unlock_extent;
1246

1247
	list_for_each_entry(entry, &target_list, list) {
1248
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
1249
					       &cached_state);
1250
		if (ret < 0)
1251
			break;
1252
	}
1253

1254
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1255
		list_del_init(&entry->list);
1256
		kfree(entry);
1257
	}
1258
unlock_extent:
1259
	btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
1260
free_folios:
1261
	for (int i = 0; i < nr_pages; i++) {
1262
		if (!folios[i])
1263
			break;
1264
		folio_unlock(folios[i]);
1265
		folio_put(folios[i]);
1266
	}
1267
	kfree(folios);
1268
	return ret;
1269
}
1270

1271
static int defrag_one_cluster(struct btrfs_inode *inode,
1272
			      struct file_ra_state *ra,
1273
			      u64 start, u32 len, u32 extent_thresh,
1274
			      u64 newer_than, bool do_compress,
1275
			      unsigned long *sectors_defragged,
1276
			      unsigned long max_sectors,
1277
			      u64 *last_scanned_ret)
1278
{
1279
	const u32 sectorsize = inode->root->fs_info->sectorsize;
1280
	struct defrag_target_range *entry;
1281
	struct defrag_target_range *tmp;
1282
	LIST_HEAD(target_list);
1283
	int ret;
1284

1285
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1286
				     newer_than, do_compress, false,
1287
				     &target_list, NULL);
1288
	if (ret < 0)
1289
		goto out;
1290

1291
	list_for_each_entry(entry, &target_list, list) {
1292
		u32 range_len = entry->len;
1293

1294
		/* Reached or beyond the limit */
1295
		if (max_sectors && *sectors_defragged >= max_sectors) {
1296
			ret = 1;
1297
			break;
1298
		}
1299

1300
		if (max_sectors)
1301
			range_len = min_t(u32, range_len,
1302
				(max_sectors - *sectors_defragged) * sectorsize);
1303

1304
		/*
1305
		 * If defrag_one_range() has updated last_scanned_ret,
1306
		 * our range may already be invalid (e.g. hole punched).
1307
		 * Skip if our range is before last_scanned_ret, as there is
1308
		 * no need to defrag the range anymore.
1309
		 */
1310
		if (entry->start + range_len <= *last_scanned_ret)
1311
			continue;
1312

1313
		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1314
				ra, NULL, entry->start >> PAGE_SHIFT,
1315
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
1316
				(entry->start >> PAGE_SHIFT) + 1);
1317
		/*
1318
		 * Here we may not defrag any range if holes are punched before
1319
		 * we locked the pages.
1320
		 * But that's fine, it only affects the @sectors_defragged
1321
		 * accounting.
1322
		 */
1323
		ret = defrag_one_range(inode, entry->start, range_len,
1324
				       extent_thresh, newer_than, do_compress,
1325
				       last_scanned_ret);
1326
		if (ret < 0)
1327
			break;
1328
		*sectors_defragged += range_len >>
1329
				      inode->root->fs_info->sectorsize_bits;
1330
	}
1331
out:
1332
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1333
		list_del_init(&entry->list);
1334
		kfree(entry);
1335
	}
1336
	if (ret >= 0)
1337
		*last_scanned_ret = max(*last_scanned_ret, start + len);
1338
	return ret;
1339
}
1340

1341
/*
1342
 * Entry point to file defragmentation.
1343
 *
1344
 * @inode:	   inode to be defragged
1345
 * @ra:		   readahead state
1346
 * @range:	   defrag options including range and flags
1347
 * @newer_than:	   minimum transid to defrag
1348
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1349
 *		   will be defragged.
1350
 *
1351
 * Return <0 for error.
1352
 * Return >=0 for the number of sectors defragged, and range->start will be updated
1353
 * to indicate the file offset where next defrag should be started at.
1354
 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1355
 *  defragging all the range).
1356
 */
1357
int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
1358
		      struct btrfs_ioctl_defrag_range_args *range,
1359
		      u64 newer_than, unsigned long max_to_defrag)
1360
{
1361
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1362
	unsigned long sectors_defragged = 0;
1363
	u64 isize = i_size_read(&inode->vfs_inode);
1364
	u64 cur;
1365
	u64 last_byte;
1366
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
1367
	bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
1368
	int compress_type = BTRFS_COMPRESS_ZLIB;
1369
	int compress_level = 0;
1370
	int ret = 0;
1371
	u32 extent_thresh = range->extent_thresh;
1372
	pgoff_t start_index;
1373

1374
	ASSERT(ra);
1375

1376
	if (isize == 0)
1377
		return 0;
1378

1379
	if (range->start >= isize)
1380
		return -EINVAL;
1381

1382
	if (do_compress) {
1383
		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
1384
			if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
1385
				return -EINVAL;
1386
			if (range->compress.type) {
1387
				compress_type  = range->compress.type;
1388
				compress_level = range->compress.level;
1389
				if (!btrfs_compress_level_valid(compress_type, compress_level))
1390
					return -EINVAL;
1391
			}
1392
		} else {
1393
			if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1394
				return -EINVAL;
1395
			if (range->compress_type)
1396
				compress_type = range->compress_type;
1397
		}
1398
	} else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
1399
		compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
1400
		compress_level = 1;
1401
	}
1402

1403
	if (extent_thresh == 0)
1404
		extent_thresh = SZ_256K;
1405

1406
	if (range->start + range->len > range->start) {
1407
		/* Got a specific range */
1408
		last_byte = min(isize, range->start + range->len);
1409
	} else {
1410
		/* Defrag until file end */
1411
		last_byte = isize;
1412
	}
1413

1414
	/* Align the range */
1415
	cur = round_down(range->start, fs_info->sectorsize);
1416
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1417

1418
	/*
1419
	 * Make writeback start from the beginning of the range, so that the
1420
	 * defrag range can be written sequentially.
1421
	 */
1422
	start_index = cur >> PAGE_SHIFT;
1423
	if (start_index < inode->vfs_inode.i_mapping->writeback_index)
1424
		inode->vfs_inode.i_mapping->writeback_index = start_index;
1425

1426
	while (cur < last_byte) {
1427
		const unsigned long prev_sectors_defragged = sectors_defragged;
1428
		u64 last_scanned = cur;
1429
		u64 cluster_end;
1430

1431
		if (btrfs_defrag_cancelled(fs_info)) {
1432
			ret = -EAGAIN;
1433
			break;
1434
		}
1435

1436
		/* We want the cluster end at page boundary when possible */
1437
		cluster_end = (((cur >> PAGE_SHIFT) +
1438
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1439
		cluster_end = min(cluster_end, last_byte);
1440

1441
		btrfs_inode_lock(inode, 0);
1442
		if (IS_SWAPFILE(&inode->vfs_inode)) {
1443
			ret = -ETXTBSY;
1444
			btrfs_inode_unlock(inode, 0);
1445
			break;
1446
		}
1447
		if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
1448
			btrfs_inode_unlock(inode, 0);
1449
			break;
1450
		}
1451
		if (do_compress || no_compress) {
1452
			inode->defrag_compress = compress_type;
1453
			inode->defrag_compress_level = compress_level;
1454
		}
1455
		ret = defrag_one_cluster(inode, ra, cur,
1456
				cluster_end + 1 - cur, extent_thresh,
1457
				newer_than, do_compress || no_compress,
1458
				&sectors_defragged,
1459
				max_to_defrag, &last_scanned);
1460

1461
		if (sectors_defragged > prev_sectors_defragged)
1462
			balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
1463

1464
		btrfs_inode_unlock(inode, 0);
1465
		if (ret < 0)
1466
			break;
1467
		cur = max(cluster_end + 1, last_scanned);
1468
		if (ret > 0) {
1469
			ret = 0;
1470
			break;
1471
		}
1472
		cond_resched();
1473
	}
1474

1475
	/*
1476
	 * Update range.start for autodefrag, this will indicate where to start
1477
	 * in next run.
1478
	 */
1479
	range->start = cur;
1480
	if (sectors_defragged) {
1481
		/*
1482
		 * We have defragged some sectors, for compression case they
1483
		 * need to be written back immediately.
1484
		 */
1485
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1486
			filemap_flush(inode->vfs_inode.i_mapping);
1487
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1488
				     &inode->runtime_flags))
1489
				filemap_flush(inode->vfs_inode.i_mapping);
1490
		}
1491
		if (range->compress_type == BTRFS_COMPRESS_LZO)
1492
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1493
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1494
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1495
		ret = sectors_defragged;
1496
	}
1497
	if (do_compress || no_compress) {
1498
		btrfs_inode_lock(inode, 0);
1499
		inode->defrag_compress = BTRFS_COMPRESS_NONE;
1500
		btrfs_inode_unlock(inode, 0);
1501
	}
1502
	return ret;
1503
}
1504

1505
void __cold btrfs_auto_defrag_exit(void)
1506
{
1507
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
1508
}
1509

1510
int __init btrfs_auto_defrag_init(void)
1511
{
1512
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
1513
					sizeof(struct inode_defrag), 0, 0, NULL);
1514
	if (!btrfs_inode_defrag_cachep)
1515
		return -ENOMEM;
1516

1517
	return 0;
1518
}
1519

1520
Product

Resources

Company