GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/ctree.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}
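
/*
 * An illustrative sketch of the leaf layout leaf_data_end() relies on: item
 * headers grow forward from the leaf header while item data grows backward
 * from the end of the block, so the offset of the last item's data is where
 * the free space ends:
 *
 *	[header][item 0][item 1] ... free space ... [data 1][data 0]
 *	                                            ^
 *	                                            leaf_data_end()
 *
 * With nritems == 0 the whole data area is free, hence BTRFS_LEAF_DATA_SIZE()
 * is returned.
 */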

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offset.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving the items within
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item numbers.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
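
/*
 * Typical call pattern for the two helpers above (a minimal sketch, with
 * placeholder key values):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { .objectid = 256, .type = 1, .offset = 0 };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0) {
 *		btrfs_free_path(path);
 *		return ret;
 *	}
 *	... use path->nodes[0] and path->slots[0] ...
 *	btrfs_free_path(path);
 *
 * btrfs_free_path() calls btrfs_release_path() first, so any locks and extent
 * buffer references still held by the path are dropped before the path is
 * freed.
 */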

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (refcount_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc roots)
 * just get put onto a simple dirty list.  The transaction walks this list to
 * make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_inc_ref(trans, root, cow, 1);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	} else {
		ret = btrfs_inc_ref(trans, root, cow, 0);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
			       const struct btrfs_root *root,
			       const struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root.  This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == btrfs_root_id(root) ||
		     btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			ret = btrfs_set_disk_extent_flags(trans, buf,
						BTRFS_BLOCK_FLAG_FULL_BACKREF);
			if (ret)
				return ret;
		} else {

			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     btrfs_root_id(root), &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, btrfs_root_id(root));

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_unlock_cow;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		refcount_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto error_unlock_cow;
			}
		}
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	trace_btrfs_cow_block(root, buf, cow);
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;

error_unlock_cow:
	btrfs_tree_unlock(cow);
	free_extent_buffer(cow);
	return ret;
}

static inline int should_cow_block(const struct btrfs_trans_handle *trans,
				   const struct btrfs_root *root,
				   const struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED).  If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				     cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * same as comp_keys only with two struct btrfs_key keys
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
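
/*
 * For illustration: keys sort lexicographically on (objectid, type, offset),
 * so for example
 *
 *	{ .objectid = 256, .type = 1, .offset = 0 }
 *   <	{ .objectid = 256, .type = 2, .offset = 0 }
 *   <	{ .objectid = 257, .type = 0, .offset = 100 }
 *
 * and btrfs_comp_cpu_keys() returns -1, 0 or 1 for k1 < k2, k1 == k2 and
 * k1 > k2 respectively.
 */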

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot.  If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oil;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oil = get_eb_offset_in_folio(eb, offset);

		if (oil + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			oil = get_eb_offset_in_folio(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oil);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
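
/*
 * Return value illustration: for an extent buffer holding keys A < B < C in
 * slots 0..2, searching for B returns 0 with *slot == 1 (exact match), while
 * searching for a key K with A < K < B returns 1 with *slot == 1, the slot
 * where K would be inserted.  A key greater than C yields 1 with *slot == 3,
 * one position past the last slot.
 */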

static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
						    right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			refcount_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     const struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree.  The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	struct extent_buffer *tmp = NULL;
	int ret = 0;
	int ret2;
	int parent_level;
	bool read_tmp = false;
	bool tmp_locked = false;
	bool path_released = false;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
	check.owner_root = btrfs_root_id(root);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search.  We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, parent_level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp, &check)) {
				ret = -EUCLEAN;
				goto out;
			}
			*eb_ret = tmp;
			tmp = NULL;
			ret = 0;
			goto out;
		}

		if (p->nowait) {
			ret = -EAGAIN;
			goto out;
		}

		if (!p->skip_locking) {
			btrfs_unlock_up_safe(p, parent_level + 1);
			btrfs_maybe_reset_lockdep_class(root, tmp);
			tmp_locked = true;
			btrfs_tree_read_lock(tmp);
			btrfs_release_path(p);
			ret = -EAGAIN;
			path_released = true;
		}

		/* Now we're allowed to do a blocking uptodate check. */
		ret2 = btrfs_read_extent_buffer(tmp, &check);
		if (ret2) {
			ret = ret2;
			goto out;
		}

		if (ret == 0) {
			ASSERT(!tmp_locked);
			*eb_ret = tmp;
			tmp = NULL;
		}
		goto out;
	} else if (p->nowait) {
		ret = -EAGAIN;
		goto out;
	}

	if (!p->skip_locking) {
		btrfs_unlock_up_safe(p, parent_level + 1);
		ret = -EAGAIN;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, parent_level, slot, key->objectid);

	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
	if (IS_ERR(tmp)) {
		ret = PTR_ERR(tmp);
		tmp = NULL;
		goto out;
	}
	read_tmp = true;

	if (!p->skip_locking) {
		ASSERT(ret == -EAGAIN);
		btrfs_maybe_reset_lockdep_class(root, tmp);
		tmp_locked = true;
		btrfs_tree_read_lock(tmp);
		btrfs_release_path(p);
		path_released = true;
	}

	/* Now we're allowed to do a blocking uptodate check. */
	ret2 = btrfs_read_extent_buffer(tmp, &check);
	if (ret2) {
		ret = ret2;
		goto out;
	}

	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp)) {
		ret = -EIO;
		goto out;
	}

	if (ret == 0) {
		ASSERT(!tmp_locked);
		*eb_ret = tmp;
		tmp = NULL;
	}
out:
	if (tmp) {
		if (tmp_locked)
			btrfs_tree_read_unlock(tmp);
		if (read_tmp && ret && ret != -EAGAIN)
			free_extent_buffer_stale(tmp);
		else
			free_extent_buffer(tmp);
	}
	if (ret && !path_released)
		btrfs_release_path(p);

	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
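
/*
 * Example use (hypothetical caller values): to locate the inode item of a
 * given inode number one could call
 *
 *	ret = btrfs_find_item(fs_root, path, ino, 0, BTRFS_INODE_ITEM_KEY,
 *			      &found_key);
 *
 * 0 means found_key now holds a key with the requested objectid and type,
 * 1 means no such item exists, and a negative value is an error from the
 * underlying search.
 */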

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		refcount_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version.  The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(const struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to.  Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}
1828
1829
static int search_leaf(struct btrfs_trans_handle *trans,
1830
struct btrfs_root *root,
1831
const struct btrfs_key *key,
1832
struct btrfs_path *path,
1833
int ins_len,
1834
int prev_cmp)
1835
{
1836
struct extent_buffer *leaf = path->nodes[0];
1837
int leaf_free_space = -1;
1838
int search_low_slot = 0;
1839
int ret;
1840
bool do_bin_search = true;
1841
1842
/*
1843
* If we are doing an insertion, the leaf has enough free space and the
1844
* destination slot for the key is not slot 0, then we can unlock our
1845
* write lock on the parent, and any other upper nodes, before doing the
1846
* binary search on the leaf (with search_for_key_slot()), allowing other
1847
* tasks to lock the parent and any other upper nodes.
1848
*/
1849
if (ins_len > 0) {
1850
/*
1851
* Cache the leaf free space, since we will need it later and it
1852
* will not change until then.
1853
*/
1854
leaf_free_space = btrfs_leaf_free_space(leaf);
1855
1856
/*
1857
* !path->locks[1] means we have a single node tree, the leaf is
1858
* the root of the tree.
1859
*/
1860
if (path->locks[1] && leaf_free_space >= ins_len) {
1861
struct btrfs_disk_key first_key;
1862
1863
ASSERT(btrfs_header_nritems(leaf) > 0);
1864
btrfs_item_key(leaf, &first_key, 0);
1865
1866
/*
1867
* Doing the extra comparison with the first key is cheap,
1868
* taking into account that the first key is very likely
1869
* already in a cache line because it immediately follows
1870
* the extent buffer's header and we have recently accessed
1871
* the header's level field.
1872
*/
1873
ret = btrfs_comp_keys(&first_key, key);
1874
if (ret < 0) {
1875
/*
1876
* The first key is smaller than the key we want
1877
* to insert, so we are safe to unlock all upper
1878
* nodes and we have to do the binary search.
1879
*
1880
* We do use btrfs_unlock_up_safe() and not
1881
* unlock_up() because the later does not unlock
1882
* nodes with a slot of 0 - we can safely unlock
1883
* any node even if its slot is 0 since in this
1884
* case the key does not end up at slot 0 of the
1885
* leaf and there's no need to split the leaf.
1886
*/
1887
btrfs_unlock_up_safe(path, 1);
1888
search_low_slot = 1;
1889
} else {
1890
/*
1891
* The first key is >= then the key we want to
1892
* insert, so we can skip the binary search as
1893
* the target key will be at slot 0.
1894
*
1895
* We can not unlock upper nodes when the key is
1896
* less than the first key, because we will need
1897
* to update the key at slot 0 of the parent node
1898
* and possibly of other upper nodes too.
1899
* If the key matches the first key, then we can
1900
* unlock all the upper nodes, using
1901
* btrfs_unlock_up_safe() instead of unlock_up()
1902
* as stated above.
1903
*/
1904
if (ret == 0)
1905
btrfs_unlock_up_safe(path, 1);
1906
/*
1907
* ret is already 0 or 1, matching the result of
1908
* a btrfs_bin_search() call, so there is no need
1909
* to adjust it.
1910
*/
1911
do_bin_search = false;
1912
path->slots[0] = 0;
1913
}
1914
}
1915
}
1916
1917
if (do_bin_search) {
1918
ret = search_for_key_slot(leaf, search_low_slot, key,
1919
prev_cmp, &path->slots[0]);
1920
if (ret < 0)
1921
return ret;
1922
}
1923
1924
if (ins_len > 0) {
1925
/*
1926
* Item key already exists. In this case, if we are allowed to
1927
* insert the item (for example, in dir_item case, item key
1928
* collision is allowed), it will be merged with the original
1929
* item. Only the item size grows, no new btrfs item will be
1930
* added. If search_for_extension is not set, ins_len already
1931
* accounts for the size of struct btrfs_item; deduct it here so the leaf
1932
* space check will be correct.
1933
*/
1934
if (ret == 0 && !path->search_for_extension) {
1935
ASSERT(ins_len >= sizeof(struct btrfs_item));
1936
ins_len -= sizeof(struct btrfs_item);
1937
}
1938
1939
ASSERT(leaf_free_space >= 0);
1940
1941
if (leaf_free_space < ins_len) {
1942
int ret2;
1943
1944
ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
1945
ASSERT(ret2 <= 0);
1946
if (WARN_ON(ret2 > 0))
1947
ret2 = -EUCLEAN;
1948
if (ret2)
1949
ret = ret2;
1950
}
1951
}
1952
1953
return ret;
1954
}
1955
1956
/*
1957
* Look for a key in a tree and perform necessary modifications to preserve
1958
* tree invariants.
1959
*
1960
* @trans: Handle of transaction, used when modifying the tree
1961
* @p: Holds all btree nodes along the search path
1962
* @root: The root node of the tree
1963
* @key: The key we are looking for
1964
* @ins_len: Indicates purpose of search:
1965
* >0 for inserts, it's the size of the item inserted (*)
1966
* <0 for deletions
1967
* 0 for plain searches, not modifying the tree
1968
*
1969
* (*) If size of item inserted doesn't include
1970
* sizeof(struct btrfs_item), then p->search_for_extension must
1971
* be set.
1972
* @cow: boolean should CoW operations be performed. Must always be 1
1973
* when modifying the tree.
1974
*
1975
* If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1976
* If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1977
*
1978
* If @key is found, 0 is returned and you can find the item in the leaf level
1979
* of the path (level 0)
1980
*
1981
* If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1982
* points to the slot where it should be inserted
1983
*
1984
* If an error is encountered while searching the tree a negative error number
1985
* is returned
1986
*/
1987
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1988
const struct btrfs_key *key, struct btrfs_path *p,
1989
int ins_len, int cow)
1990
{
1991
struct btrfs_fs_info *fs_info;
1992
struct extent_buffer *b;
1993
int slot;
1994
int ret;
1995
int level;
1996
int lowest_unlock = 1;
1997
/* everything at write_lock_level or lower must be write locked */
1998
int write_lock_level = 0;
1999
u8 lowest_level = 0;
2000
int min_write_lock_level;
2001
int prev_cmp;
2002
2003
if (!root)
2004
return -EINVAL;
2005
2006
fs_info = root->fs_info;
2007
might_sleep();
2008
2009
lowest_level = p->lowest_level;
2010
WARN_ON(lowest_level && ins_len > 0);
2011
WARN_ON(p->nodes[0] != NULL);
2012
BUG_ON(!cow && ins_len);
2013
2014
/*
2015
* For now only allow nowait for read only operations. There's no
2016
* strict reason why we can't, we just only need it for reads so it's
2017
* only implemented for reads.
2018
*/
2019
ASSERT(!p->nowait || !cow);
2020
2021
if (ins_len < 0) {
2022
lowest_unlock = 2;
2023
2024
/* when we are removing items, we might have to go up to level
2025
* two as we update tree pointers. Make sure we keep write
2026
* locks for those levels as well.
2027
*/
2028
write_lock_level = 2;
2029
} else if (ins_len > 0) {
2030
/*
2031
* for inserting items, make sure we have a write lock on
2032
* level 1 so we can update keys
2033
*/
2034
write_lock_level = 1;
2035
}
2036
2037
if (!cow)
2038
write_lock_level = -1;
2039
2040
if (cow && (p->keep_locks || p->lowest_level))
2041
write_lock_level = BTRFS_MAX_LEVEL;
2042
2043
min_write_lock_level = write_lock_level;
2044
2045
if (p->need_commit_sem) {
2046
ASSERT(p->search_commit_root);
2047
if (p->nowait) {
2048
if (!down_read_trylock(&fs_info->commit_root_sem))
2049
return -EAGAIN;
2050
} else {
2051
down_read(&fs_info->commit_root_sem);
2052
}
2053
}
2054
2055
again:
2056
prev_cmp = -1;
2057
b = btrfs_search_slot_get_root(root, p, write_lock_level);
2058
if (IS_ERR(b)) {
2059
ret = PTR_ERR(b);
2060
goto done;
2061
}
2062
2063
while (b) {
2064
int dec = 0;
2065
int ret2;
2066
2067
level = btrfs_header_level(b);
2068
2069
if (cow) {
2070
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2071
2072
/*
2073
* if we don't really need to cow this block
2074
* then we don't want to set the path blocking,
2075
* so we test it here
2076
*/
2077
if (!should_cow_block(trans, root, b))
2078
goto cow_done;
2079
2080
/*
2081
* must have write locks on this node and the
2082
* parent
2083
*/
2084
if (level > write_lock_level ||
2085
(level + 1 > write_lock_level &&
2086
level + 1 < BTRFS_MAX_LEVEL &&
2087
p->nodes[level + 1])) {
2088
write_lock_level = level + 1;
2089
btrfs_release_path(p);
2090
goto again;
2091
}
2092
2093
if (last_level)
2094
ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
2095
&b, BTRFS_NESTING_COW);
2096
else
2097
ret2 = btrfs_cow_block(trans, root, b,
2098
p->nodes[level + 1],
2099
p->slots[level + 1], &b,
2100
BTRFS_NESTING_COW);
2101
if (ret2) {
2102
ret = ret2;
2103
goto done;
2104
}
2105
}
2106
cow_done:
2107
p->nodes[level] = b;
2108
2109
/*
2110
* we have a lock on b and as long as we aren't changing
2111
* the tree, there is no way for the items in b to change.
2112
* It is safe to drop the lock on our parent before we
2113
* go through the expensive btree search on b.
2114
*
2115
* If we're inserting or deleting (ins_len != 0), then we might
2116
* be changing slot zero, which may require changing the parent.
2117
* So, we can't drop the lock until after we know which slot
2118
* we're operating on.
2119
*/
2120
if (!ins_len && !p->keep_locks) {
2121
int u = level + 1;
2122
2123
if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2124
btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2125
p->locks[u] = 0;
2126
}
2127
}
2128
2129
if (level == 0) {
2130
if (ins_len > 0)
2131
ASSERT(write_lock_level >= 1);
2132
2133
ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2134
if (!p->search_for_split)
2135
unlock_up(p, level, lowest_unlock,
2136
min_write_lock_level, NULL);
2137
goto done;
2138
}
2139
2140
ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2141
if (ret < 0)
2142
goto done;
2143
prev_cmp = ret;
2144
2145
if (ret && slot > 0) {
2146
dec = 1;
2147
slot--;
2148
}
2149
p->slots[level] = slot;
2150
ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2151
&write_lock_level);
2152
if (ret2 == -EAGAIN)
2153
goto again;
2154
if (ret2) {
2155
ret = ret2;
2156
goto done;
2157
}
2158
b = p->nodes[level];
2159
slot = p->slots[level];
2160
2161
/*
2162
* Slot 0 is special, if we change the key we have to update
2163
* the parent pointer which means we must have a write lock on
2164
* the parent
2165
*/
2166
if (slot == 0 && ins_len && write_lock_level < level + 1) {
2167
write_lock_level = level + 1;
2168
btrfs_release_path(p);
2169
goto again;
2170
}
2171
2172
unlock_up(p, level, lowest_unlock, min_write_lock_level,
2173
&write_lock_level);
2174
2175
if (level == lowest_level) {
2176
if (dec)
2177
p->slots[level]++;
2178
goto done;
2179
}
2180
2181
ret2 = read_block_for_search(root, p, &b, slot, key);
2182
if (ret2 == -EAGAIN && !p->nowait)
2183
goto again;
2184
if (ret2) {
2185
ret = ret2;
2186
goto done;
2187
}
2188
2189
if (!p->skip_locking) {
2190
level = btrfs_header_level(b);
2191
2192
btrfs_maybe_reset_lockdep_class(root, b);
2193
2194
if (level <= write_lock_level) {
2195
btrfs_tree_lock(b);
2196
p->locks[level] = BTRFS_WRITE_LOCK;
2197
} else {
2198
if (p->nowait) {
2199
if (!btrfs_try_tree_read_lock(b)) {
2200
free_extent_buffer(b);
2201
ret = -EAGAIN;
2202
goto done;
2203
}
2204
} else {
2205
btrfs_tree_read_lock(b);
2206
}
2207
p->locks[level] = BTRFS_READ_LOCK;
2208
}
2209
p->nodes[level] = b;
2210
}
2211
}
2212
ret = 1;
2213
done:
2214
if (ret < 0 && !p->skip_release_on_error)
2215
btrfs_release_path(p);
2216
2217
if (p->need_commit_sem) {
2218
int ret2;
2219
2220
ret2 = finish_need_commit_sem_search(p);
2221
up_read(&fs_info->commit_root_sem);
2222
if (ret2)
2223
ret = ret2;
2224
}
2225
2226
return ret;
2227
}
2228
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2229
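/*
 * Illustrative usage sketch (ino and root are placeholders supplied by
 * the caller): a plain read-only lookup with btrfs_search_slot().
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		... exact match at path->nodes[0], slot path->slots[0] ...
 *	else if (ret == 1)
 *		... not found, the slot is the would-be insert position ...
 *	btrfs_free_path(path);
 */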
2230
/*
2231
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2232
* current state of the tree together with the operations recorded in the tree
2233
* modification log to search for the key in a previous version of this tree, as
2234
* denoted by the time_seq parameter.
2235
*
2236
* Naturally, there is no support for insert, delete or cow operations.
2237
*
2238
* The resulting path and return value will be set up as if we called
2239
* btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2240
*/
2241
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2242
struct btrfs_path *p, u64 time_seq)
2243
{
2244
struct btrfs_fs_info *fs_info = root->fs_info;
2245
struct extent_buffer *b;
2246
int slot;
2247
int ret;
2248
int level;
2249
int lowest_unlock = 1;
2250
u8 lowest_level = 0;
2251
2252
lowest_level = p->lowest_level;
2253
WARN_ON(p->nodes[0] != NULL);
2254
ASSERT(!p->nowait);
2255
2256
if (p->search_commit_root) {
2257
BUG_ON(time_seq);
2258
return btrfs_search_slot(NULL, root, key, p, 0, 0);
2259
}
2260
2261
again:
2262
b = btrfs_get_old_root(root, time_seq);
2263
if (!b) {
2264
ret = -EIO;
2265
goto done;
2266
}
2267
level = btrfs_header_level(b);
2268
p->locks[level] = BTRFS_READ_LOCK;
2269
2270
while (b) {
2271
int dec = 0;
2272
int ret2;
2273
2274
level = btrfs_header_level(b);
2275
p->nodes[level] = b;
2276
2277
/*
2278
* we have a lock on b and as long as we aren't changing
2279
* the tree, there is no way for the items in b to change.
2280
* It is safe to drop the lock on our parent before we
2281
* go through the expensive btree search on b.
2282
*/
2283
btrfs_unlock_up_safe(p, level + 1);
2284
2285
ret = btrfs_bin_search(b, 0, key, &slot);
2286
if (ret < 0)
2287
goto done;
2288
2289
if (level == 0) {
2290
p->slots[level] = slot;
2291
unlock_up(p, level, lowest_unlock, 0, NULL);
2292
goto done;
2293
}
2294
2295
if (ret && slot > 0) {
2296
dec = 1;
2297
slot--;
2298
}
2299
p->slots[level] = slot;
2300
unlock_up(p, level, lowest_unlock, 0, NULL);
2301
2302
if (level == lowest_level) {
2303
if (dec)
2304
p->slots[level]++;
2305
goto done;
2306
}
2307
2308
ret2 = read_block_for_search(root, p, &b, slot, key);
2309
if (ret2 == -EAGAIN && !p->nowait)
2310
goto again;
2311
if (ret2) {
2312
ret = ret2;
2313
goto done;
2314
}
2315
2316
level = btrfs_header_level(b);
2317
btrfs_tree_read_lock(b);
2318
b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
2319
if (!b) {
2320
ret = -ENOMEM;
2321
goto done;
2322
}
2323
p->locks[level] = BTRFS_READ_LOCK;
2324
p->nodes[level] = b;
2325
}
2326
ret = 1;
2327
done:
2328
if (ret < 0)
2329
btrfs_release_path(p);
2330
2331
return ret;
2332
}
2333
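/*
 * Note: callers of btrfs_search_old_slot() typically pair it with
 * btrfs_get_tree_mod_seq()/btrfs_put_tree_mod_seq() to obtain @time_seq
 * and keep the tree mod log entries alive for the duration of the search
 * (the backref walking code is the main user).
 */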
2334
/*
2335
* Search the tree again to find a leaf with smaller keys.
2336
* Returns 0 if it found something.
2337
* Returns 1 if there are no smaller keys.
2338
* Returns < 0 on error.
2339
*
2340
* This may release the path, and so you may lose any locks held at the
2341
* time you call it.
2342
*/
2343
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2344
{
2345
struct btrfs_key key;
2346
struct btrfs_key orig_key;
2347
struct btrfs_disk_key found_key;
2348
int ret;
2349
2350
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2351
orig_key = key;
2352
2353
if (key.offset > 0) {
2354
key.offset--;
2355
} else if (key.type > 0) {
2356
key.type--;
2357
key.offset = (u64)-1;
2358
} else if (key.objectid > 0) {
2359
key.objectid--;
2360
key.type = (u8)-1;
2361
key.offset = (u64)-1;
2362
} else {
2363
return 1;
2364
}
2365
2366
btrfs_release_path(path);
2367
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2368
if (ret <= 0)
2369
return ret;
2370
2371
/*
2372
* Previous key not found. Even if we were at slot 0 of the leaf we had
2373
* before releasing the path and calling btrfs_search_slot(), we now may
2374
* be in a slot pointing to the same original key - this can happen if
2375
* after we released the path, one or more items were moved from a
2376
* sibling leaf into the front of the leaf we had due to an insertion
2377
* (see push_leaf_right()).
2378
* If we hit this case and our slot is > 0, just decrement the slot
2379
* so that the caller does not process the same key again, which may or
2380
* may not break the caller, depending on its logic.
2381
*/
2382
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2383
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2384
ret = btrfs_comp_keys(&found_key, &orig_key);
2385
if (ret == 0) {
2386
if (path->slots[0] > 0) {
2387
path->slots[0]--;
2388
return 0;
2389
}
2390
/*
2391
* At slot 0, same key as before, it means orig_key is
2392
* the lowest, leftmost, key in the tree. We're done.
2393
*/
2394
return 1;
2395
}
2396
}
2397
2398
btrfs_item_key(path->nodes[0], &found_key, 0);
2399
ret = btrfs_comp_keys(&found_key, &key);
2400
/*
2401
* We might have had an item with the previous key in the tree right
2402
* before we released our path. And after we released our path, that
2403
* item might have been pushed to the first slot (0) of the leaf we
2404
* were holding due to a tree balance. Alternatively, an item with the
2405
* previous key can exist as the only element of a leaf (big fat item).
2406
* Therefore account for these 2 cases, so that our callers (like
2407
* btrfs_previous_item) don't miss an existing item with a key matching
2408
* the previous key we computed above.
2409
*/
2410
if (ret <= 0)
2411
return 0;
2412
return 1;
2413
}
2414
2415
/*
2416
* helper to use instead of btrfs_search_slot() if no exact match is needed but
2417
* instead the next or previous item should be returned.
2418
* When find_higher is true, the next higher item is returned, the next lower
2419
* otherwise.
2420
* When return_any and find_higher are both true, and no higher item is found,
2421
* return the next lower instead.
2422
* When return_any is true and find_higher is false, and no lower item is found,
2423
* return the next higher instead.
2424
* It returns 0 if any item is found, 1 if none is found (tree empty), and
2425
* < 0 on error
2426
*/
2427
int btrfs_search_slot_for_read(struct btrfs_root *root,
2428
const struct btrfs_key *key,
2429
struct btrfs_path *p, int find_higher,
2430
int return_any)
2431
{
2432
int ret;
2433
struct extent_buffer *leaf;
2434
2435
again:
2436
ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2437
if (ret <= 0)
2438
return ret;
2439
/*
2440
* a return value of 1 means the path is at the position where the
2441
* item should be inserted. Normally this is the next bigger item,
2442
* but in case the previous item is the last in a leaf, path points
2443
* to the first free slot in the previous leaf, i.e. at an invalid
2444
* item.
2445
*/
2446
leaf = p->nodes[0];
2447
2448
if (find_higher) {
2449
if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2450
ret = btrfs_next_leaf(root, p);
2451
if (ret <= 0)
2452
return ret;
2453
if (!return_any)
2454
return 1;
2455
/*
2456
* no higher item found, return the next
2457
* lower instead
2458
*/
2459
return_any = 0;
2460
find_higher = 0;
2461
btrfs_release_path(p);
2462
goto again;
2463
}
2464
} else {
2465
if (p->slots[0] == 0) {
2466
ret = btrfs_prev_leaf(root, p);
2467
if (ret < 0)
2468
return ret;
2469
if (!ret) {
2470
leaf = p->nodes[0];
2471
if (p->slots[0] == btrfs_header_nritems(leaf))
2472
p->slots[0]--;
2473
return 0;
2474
}
2475
if (!return_any)
2476
return 1;
2477
/*
2478
* no lower item found, return the next
2479
* higher instead
2480
*/
2481
return_any = 0;
2482
find_higher = 1;
2483
btrfs_release_path(p);
2484
goto again;
2485
} else {
2486
--p->slots[0];
2487
}
2488
}
2489
return 0;
2490
}
2491
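/*
 * Illustrative use of btrfs_search_slot_for_read(), with root, key, path
 * and found as caller-supplied placeholders: return the first item with a
 * key >= key, or the closest smaller one if nothing bigger exists:
 *
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
 */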
2492
/*
2493
* Execute search and call btrfs_previous_item to traverse backwards if the item
2494
* was not found.
2495
*
2496
* Return 0 if found, 1 if not found and < 0 if error.
2497
*/
2498
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2499
struct btrfs_path *path)
2500
{
2501
int ret;
2502
2503
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2504
if (ret > 0)
2505
ret = btrfs_previous_item(root, path, key->objectid, key->type);
2506
2507
if (ret == 0)
2508
btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2509
2510
return ret;
2511
}
2512
2513
/*
2514
* Search for a valid slot for the given path.
2515
*
2516
* @root: The root node of the tree.
2517
* @key: Will contain a valid item if found.
2518
* @path: The starting point to validate the slot.
2519
*
2520
* Return: 0 if the item is valid
2521
* 1 if not found
2522
* <0 if error.
2523
*/
2524
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2525
struct btrfs_path *path)
2526
{
2527
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2528
int ret;
2529
2530
ret = btrfs_next_leaf(root, path);
2531
if (ret)
2532
return ret;
2533
}
2534
2535
btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2536
return 0;
2537
}
2538
2539
/*
2540
* adjust the pointers going up the tree, starting at level
2541
* making sure the right key of each node points to 'key'.
2542
* This is used after shifting pointers to the left, so it stops
2543
* fixing up pointers when a given leaf/node is not in slot 0 of the
2544
* higher levels
2545
*
2546
*/
2547
static void fixup_low_keys(struct btrfs_trans_handle *trans,
2548
const struct btrfs_path *path,
2549
const struct btrfs_disk_key *key, int level)
2550
{
2551
int i;
2552
struct extent_buffer *t;
2553
int ret;
2554
2555
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2556
int tslot = path->slots[i];
2557
2558
if (!path->nodes[i])
2559
break;
2560
t = path->nodes[i];
2561
ret = btrfs_tree_mod_log_insert_key(t, tslot,
2562
BTRFS_MOD_LOG_KEY_REPLACE);
2563
BUG_ON(ret < 0);
2564
btrfs_set_node_key(t, key, tslot);
2565
btrfs_mark_buffer_dirty(trans, path->nodes[i]);
2566
if (tslot != 0)
2567
break;
2568
}
2569
}
2570
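/*
 * Rough picture of what fixup_low_keys() does: when the first key of a
 * leaf changes, every ancestor that reaches the leaf through slot 0 must
 * get the new key as well:
 *
 *	node  [ K ... ]   <- updated because the child sits at slot 0
 *	leaf  [ K ... ]   <- new first key K
 *
 * The loop stops at the first level whose slot is not 0, since keys at
 * higher levels are then unaffected.
 */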
2571
/*
2572
* update item key.
2573
*
2574
* This function isn't completely safe. It's the caller's responsibility
2575
* that the new key won't break the order
2576
*/
2577
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2578
const struct btrfs_path *path,
2579
const struct btrfs_key *new_key)
2580
{
2581
struct btrfs_fs_info *fs_info = trans->fs_info;
2582
struct btrfs_disk_key disk_key;
2583
struct extent_buffer *eb;
2584
int slot;
2585
2586
eb = path->nodes[0];
2587
slot = path->slots[0];
2588
if (slot > 0) {
2589
btrfs_item_key(eb, &disk_key, slot - 1);
2590
if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
2591
btrfs_print_leaf(eb);
2592
btrfs_crit(fs_info,
2593
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2594
slot, btrfs_disk_key_objectid(&disk_key),
2595
btrfs_disk_key_type(&disk_key),
2596
btrfs_disk_key_offset(&disk_key),
2597
new_key->objectid, new_key->type,
2598
new_key->offset);
2599
BUG();
2600
}
2601
}
2602
if (slot < btrfs_header_nritems(eb) - 1) {
2603
btrfs_item_key(eb, &disk_key, slot + 1);
2604
if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
2605
btrfs_print_leaf(eb);
2606
btrfs_crit(fs_info,
2607
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2608
slot, btrfs_disk_key_objectid(&disk_key),
2609
btrfs_disk_key_type(&disk_key),
2610
btrfs_disk_key_offset(&disk_key),
2611
new_key->objectid, new_key->type,
2612
new_key->offset);
2613
BUG();
2614
}
2615
}
2616
2617
btrfs_cpu_key_to_disk(&disk_key, new_key);
2618
btrfs_set_item_key(eb, &disk_key, slot);
2619
btrfs_mark_buffer_dirty(trans, eb);
2620
if (slot == 0)
2621
fixup_low_keys(trans, path, &disk_key, 1);
2622
}
2623
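/*
 * Example (illustrative): btrfs_set_item_key_safe() fits cases where an
 * item keeps its data but its key must move, such as trimming the front
 * of a file extent item so that key.offset grows. The checks above only
 * verify that the new key still sorts between its neighbours; choosing a
 * sensible new key remains the caller's job.
 */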
2624
/*
2625
* Check key order of two sibling extent buffers.
2626
*
2627
* Return true if something is wrong.
2628
* Return false if everything is fine.
2629
*
2630
* Tree-checker only works inside one tree block, thus the following
2631
* corruption can not be detected by tree-checker:
2632
*
2633
* Leaf @left | Leaf @right
2634
* --------------------------------------------------------------
2635
* | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2636
*
2637
* Key f6 in leaf @left itself is valid, but not valid when the next
2638
* key in leaf @right is 7.
2639
* This can only be checked at tree block merge time.
2640
* And since tree checker has ensured all key order in each tree block
2641
* is correct, we only need to bother the last key of @left and the first
2642
* key of @right.
2643
*/
2644
static bool check_sibling_keys(const struct extent_buffer *left,
2645
const struct extent_buffer *right)
2646
{
2647
struct btrfs_key left_last;
2648
struct btrfs_key right_first;
2649
int level = btrfs_header_level(left);
2650
int nr_left = btrfs_header_nritems(left);
2651
int nr_right = btrfs_header_nritems(right);
2652
2653
/* No key to check in one of the tree blocks */
2654
if (!nr_left || !nr_right)
2655
return false;
2656
2657
if (level) {
2658
btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2659
btrfs_node_key_to_cpu(right, &right_first, 0);
2660
} else {
2661
btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2662
btrfs_item_key_to_cpu(right, &right_first, 0);
2663
}
2664
2665
if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2666
btrfs_crit(left->fs_info, "left extent buffer:");
2667
btrfs_print_tree(left, false);
2668
btrfs_crit(left->fs_info, "right extent buffer:");
2669
btrfs_print_tree(right, false);
2670
btrfs_crit(left->fs_info,
2671
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2672
left_last.objectid, left_last.type,
2673
left_last.offset, right_first.objectid,
2674
right_first.type, right_first.offset);
2675
return true;
2676
}
2677
return false;
2678
}
2679
2680
/*
2681
* try to push data from one node into the next node left in the
2682
* tree.
2683
*
2684
* returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2685
* error, and > 0 if there was no room in the left hand block.
2686
*/
2687
static int push_node_left(struct btrfs_trans_handle *trans,
2688
struct extent_buffer *dst,
2689
struct extent_buffer *src, int empty)
2690
{
2691
struct btrfs_fs_info *fs_info = trans->fs_info;
2692
int push_items = 0;
2693
int src_nritems;
2694
int dst_nritems;
2695
int ret = 0;
2696
2697
src_nritems = btrfs_header_nritems(src);
2698
dst_nritems = btrfs_header_nritems(dst);
2699
push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2700
WARN_ON(btrfs_header_generation(src) != trans->transid);
2701
WARN_ON(btrfs_header_generation(dst) != trans->transid);
2702
2703
if (!empty && src_nritems <= 8)
2704
return 1;
2705
2706
if (push_items <= 0)
2707
return 1;
2708
2709
if (empty) {
2710
push_items = min(src_nritems, push_items);
2711
if (push_items < src_nritems) {
2712
/* leave at least 8 pointers in the node if
2713
* we aren't going to empty it
2714
*/
2715
if (src_nritems - push_items < 8) {
2716
if (push_items <= 8)
2717
return 1;
2718
push_items -= 8;
2719
}
2720
}
2721
} else
2722
push_items = min(src_nritems - 8, push_items);
2723
2724
/* dst is the left eb, src is the middle eb */
2725
if (check_sibling_keys(dst, src)) {
2726
ret = -EUCLEAN;
2727
btrfs_abort_transaction(trans, ret);
2728
return ret;
2729
}
2730
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2731
if (ret) {
2732
btrfs_abort_transaction(trans, ret);
2733
return ret;
2734
}
2735
copy_extent_buffer(dst, src,
2736
btrfs_node_key_ptr_offset(dst, dst_nritems),
2737
btrfs_node_key_ptr_offset(src, 0),
2738
push_items * sizeof(struct btrfs_key_ptr));
2739
2740
if (push_items < src_nritems) {
2741
/*
2742
* btrfs_tree_mod_log_eb_copy handles logging the move, so we
2743
* don't need to do an explicit tree mod log operation for it.
2744
*/
2745
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2746
btrfs_node_key_ptr_offset(src, push_items),
2747
(src_nritems - push_items) *
2748
sizeof(struct btrfs_key_ptr));
2749
}
2750
btrfs_set_header_nritems(src, src_nritems - push_items);
2751
btrfs_set_header_nritems(dst, dst_nritems + push_items);
2752
btrfs_mark_buffer_dirty(trans, src);
2753
btrfs_mark_buffer_dirty(trans, dst);
2754
2755
return ret;
2756
}
2757
2758
/*
2759
* try to push data from one node into the next node right in the
2760
* tree.
2761
*
2762
* returns 0 if some ptrs were pushed, < 0 if there was some horrible
2763
* error, and > 0 if there was no room in the right hand block.
2764
*
2765
* this will only push up to 1/2 the contents of the left node over
2766
*/
2767
static int balance_node_right(struct btrfs_trans_handle *trans,
2768
struct extent_buffer *dst,
2769
struct extent_buffer *src)
2770
{
2771
struct btrfs_fs_info *fs_info = trans->fs_info;
2772
int push_items = 0;
2773
int max_push;
2774
int src_nritems;
2775
int dst_nritems;
2776
int ret = 0;
2777
2778
WARN_ON(btrfs_header_generation(src) != trans->transid);
2779
WARN_ON(btrfs_header_generation(dst) != trans->transid);
2780
2781
src_nritems = btrfs_header_nritems(src);
2782
dst_nritems = btrfs_header_nritems(dst);
2783
push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2784
if (push_items <= 0)
2785
return 1;
2786
2787
if (src_nritems < 4)
2788
return 1;
2789
2790
max_push = src_nritems / 2 + 1;
2791
/* don't try to empty the node */
2792
if (max_push >= src_nritems)
2793
return 1;
2794
2795
if (max_push < push_items)
2796
push_items = max_push;
2797
2798
/* dst is the right eb, src is the middle eb */
2799
if (check_sibling_keys(src, dst)) {
2800
ret = -EUCLEAN;
2801
btrfs_abort_transaction(trans, ret);
2802
return ret;
2803
}
2804
2805
/*
2806
* btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2807
* need to do an explicit tree mod log operation for it.
2808
*/
2809
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2810
btrfs_node_key_ptr_offset(dst, 0),
2811
(dst_nritems) *
2812
sizeof(struct btrfs_key_ptr));
2813
2814
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2815
push_items);
2816
if (ret) {
2817
btrfs_abort_transaction(trans, ret);
2818
return ret;
2819
}
2820
copy_extent_buffer(dst, src,
2821
btrfs_node_key_ptr_offset(dst, 0),
2822
btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2823
push_items * sizeof(struct btrfs_key_ptr));
2824
2825
btrfs_set_header_nritems(src, src_nritems - push_items);
2826
btrfs_set_header_nritems(dst, dst_nritems + push_items);
2827
2828
btrfs_mark_buffer_dirty(trans, src);
2829
btrfs_mark_buffer_dirty(trans, dst);
2830
2831
return ret;
2832
}
2833
2834
/*
2835
* helper function to insert a new root level in the tree.
2836
* A new node is allocated, and a single item is inserted to
2837
* point to the existing root
2838
*
2839
* returns zero on success or < 0 on failure.
2840
*/
2841
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2842
struct btrfs_root *root,
2843
struct btrfs_path *path, int level)
2844
{
2845
u64 lower_gen;
2846
struct extent_buffer *lower;
2847
struct extent_buffer *c;
2848
struct extent_buffer *old;
2849
struct btrfs_disk_key lower_key;
2850
int ret;
2851
2852
BUG_ON(path->nodes[level]);
2853
BUG_ON(path->nodes[level-1] != root->node);
2854
2855
lower = path->nodes[level-1];
2856
if (level == 1)
2857
btrfs_item_key(lower, &lower_key, 0);
2858
else
2859
btrfs_node_key(lower, &lower_key, 0);
2860
2861
c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
2862
&lower_key, level, root->node->start, 0,
2863
0, BTRFS_NESTING_NEW_ROOT);
2864
if (IS_ERR(c))
2865
return PTR_ERR(c);
2866
2867
root_add_used_bytes(root);
2868
2869
btrfs_set_header_nritems(c, 1);
2870
btrfs_set_node_key(c, &lower_key, 0);
2871
btrfs_set_node_blockptr(c, 0, lower->start);
2872
lower_gen = btrfs_header_generation(lower);
2873
WARN_ON(lower_gen != trans->transid);
2874
2875
btrfs_set_node_ptr_generation(c, 0, lower_gen);
2876
2877
btrfs_mark_buffer_dirty(trans, c);
2878
2879
old = root->node;
2880
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2881
if (ret < 0) {
2882
int ret2;
2883
2884
btrfs_clear_buffer_dirty(trans, c);
2885
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2886
if (ret2 < 0)
2887
btrfs_abort_transaction(trans, ret2);
2888
btrfs_tree_unlock(c);
2889
free_extent_buffer(c);
2890
return ret;
2891
}
2892
rcu_assign_pointer(root->node, c);
2893
2894
/* the super has an extra ref to root->node */
2895
free_extent_buffer(old);
2896
2897
add_root_to_dirty_list(root);
2898
refcount_inc(&c->refs);
2899
path->nodes[level] = c;
2900
path->locks[level] = BTRFS_WRITE_LOCK;
2901
path->slots[level] = 0;
2902
return 0;
2903
}
2904
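/*
 * Rough picture of insert_new_root(): the old root becomes the single
 * child of a freshly allocated node one level up,
 *
 *	before:  root = L        after:  root = C, C[0] -> L
 *
 * so the tree grows in height by one while every existing block keeps
 * its content.
 */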
2905
/*
2906
* worker function to insert a single pointer in a node.
2907
* the node should have enough room for the pointer already
2908
*
2909
* slot and level indicate where you want the key to go, and
2910
* blocknr is the block the key points to.
2911
*/
2912
static int insert_ptr(struct btrfs_trans_handle *trans,
2913
const struct btrfs_path *path,
2914
const struct btrfs_disk_key *key, u64 bytenr,
2915
int slot, int level)
2916
{
2917
struct extent_buffer *lower;
2918
int nritems;
2919
int ret;
2920
2921
BUG_ON(!path->nodes[level]);
2922
btrfs_assert_tree_write_locked(path->nodes[level]);
2923
lower = path->nodes[level];
2924
nritems = btrfs_header_nritems(lower);
2925
BUG_ON(slot > nritems);
2926
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2927
if (slot != nritems) {
2928
if (level) {
2929
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2930
slot, nritems - slot);
2931
if (ret < 0) {
2932
btrfs_abort_transaction(trans, ret);
2933
return ret;
2934
}
2935
}
2936
memmove_extent_buffer(lower,
2937
btrfs_node_key_ptr_offset(lower, slot + 1),
2938
btrfs_node_key_ptr_offset(lower, slot),
2939
(nritems - slot) * sizeof(struct btrfs_key_ptr));
2940
}
2941
if (level) {
2942
ret = btrfs_tree_mod_log_insert_key(lower, slot,
2943
BTRFS_MOD_LOG_KEY_ADD);
2944
if (ret < 0) {
2945
btrfs_abort_transaction(trans, ret);
2946
return ret;
2947
}
2948
}
2949
btrfs_set_node_key(lower, key, slot);
2950
btrfs_set_node_blockptr(lower, slot, bytenr);
2951
WARN_ON(trans->transid == 0);
2952
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2953
btrfs_set_header_nritems(lower, nritems + 1);
2954
btrfs_mark_buffer_dirty(trans, lower);
2955
2956
return 0;
2957
}
2958
2959
/*
2960
* split the node at the specified level in path in two.
2961
* The path is corrected to point to the appropriate node after the split
2962
*
2963
* Before splitting this tries to make some room in the node by pushing
2964
* left and right, if either one works, it returns right away.
2965
*
2966
* returns 0 on success and < 0 on failure
2967
*/
2968
static noinline int split_node(struct btrfs_trans_handle *trans,
2969
struct btrfs_root *root,
2970
struct btrfs_path *path, int level)
2971
{
2972
struct btrfs_fs_info *fs_info = root->fs_info;
2973
struct extent_buffer *c;
2974
struct extent_buffer *split;
2975
struct btrfs_disk_key disk_key;
2976
int mid;
2977
int ret;
2978
u32 c_nritems;
2979
2980
c = path->nodes[level];
2981
WARN_ON(btrfs_header_generation(c) != trans->transid);
2982
if (c == root->node) {
2983
/*
2984
* trying to split the root, let's make a new one
2985
*
2986
* tree mod log: We don't log removal of the old root in
2987
* insert_new_root, because that root buffer will be kept as a
2988
* normal node. We are going to log removal of half of the
2989
* elements below with btrfs_tree_mod_log_eb_copy(). We're
2990
* holding a tree lock on the buffer, which is why we cannot
2991
* race with other tree_mod_log users.
2992
*/
2993
ret = insert_new_root(trans, root, path, level + 1);
2994
if (ret)
2995
return ret;
2996
} else {
2997
ret = push_nodes_for_insert(trans, root, path, level);
2998
c = path->nodes[level];
2999
if (!ret && btrfs_header_nritems(c) <
3000
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3001
return 0;
3002
if (ret < 0)
3003
return ret;
3004
}
3005
3006
c_nritems = btrfs_header_nritems(c);
3007
mid = (c_nritems + 1) / 2;
3008
btrfs_node_key(c, &disk_key, mid);
3009
3010
split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3011
&disk_key, level, c->start, 0,
3012
0, BTRFS_NESTING_SPLIT);
3013
if (IS_ERR(split))
3014
return PTR_ERR(split);
3015
3016
root_add_used_bytes(root);
3017
ASSERT(btrfs_header_level(c) == level);
3018
3019
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3020
if (ret) {
3021
btrfs_tree_unlock(split);
3022
free_extent_buffer(split);
3023
btrfs_abort_transaction(trans, ret);
3024
return ret;
3025
}
3026
copy_extent_buffer(split, c,
3027
btrfs_node_key_ptr_offset(split, 0),
3028
btrfs_node_key_ptr_offset(c, mid),
3029
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3030
btrfs_set_header_nritems(split, c_nritems - mid);
3031
btrfs_set_header_nritems(c, mid);
3032
3033
btrfs_mark_buffer_dirty(trans, c);
3034
btrfs_mark_buffer_dirty(trans, split);
3035
3036
ret = insert_ptr(trans, path, &disk_key, split->start,
3037
path->slots[level + 1] + 1, level + 1);
3038
if (ret < 0) {
3039
btrfs_tree_unlock(split);
3040
free_extent_buffer(split);
3041
return ret;
3042
}
3043
3044
if (path->slots[level] >= mid) {
3045
path->slots[level] -= mid;
3046
btrfs_tree_unlock(c);
3047
free_extent_buffer(c);
3048
path->nodes[level] = split;
3049
path->slots[level + 1] += 1;
3050
} else {
3051
btrfs_tree_unlock(split);
3052
free_extent_buffer(split);
3053
}
3054
return 0;
3055
}
3056
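/*
 * Worked example (illustrative) for split_node(): with c_nritems = 7 we
 * get mid = 4, so pointers 0-3 stay in c while pointers 4-6 move to the
 * new 'split' node, whose first key is inserted into the parent right
 * after c (at path->slots[level + 1] + 1).
 */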
3057
/*
3058
* how many bytes are required to store the items in a leaf. start
3059
* and nr indicate which items in the leaf to check. This totals up the
3060
* space used both by the item structs and the item data
3061
*/
3062
static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3063
{
3064
int data_len;
3065
int nritems = btrfs_header_nritems(l);
3066
int end = min(nritems, start + nr) - 1;
3067
3068
if (!nr)
3069
return 0;
3070
data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3071
data_len = data_len - btrfs_item_offset(l, end);
3072
data_len += sizeof(struct btrfs_item) * nr;
3073
WARN_ON(data_len < 0);
3074
return data_len;
3075
}
3076
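/*
 * Worked example (illustrative) for leaf_space_used(): for three items
 * whose data parts are 10, 20 and 30 bytes, leaf_space_used(l, 0, 3) is
 * 60 + 3 * sizeof(struct btrfs_item), because item data is packed
 * contiguously from the end of the leaf and each item also consumes one
 * struct btrfs_item in the item array.
 */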
3077
/*
3078
* The space between the end of the leaf items and
3079
* the start of the leaf data. IOW, how much room
3080
* the leaf has left for both items and data
3081
*/
3082
int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3083
{
3084
struct btrfs_fs_info *fs_info = leaf->fs_info;
3085
int nritems = btrfs_header_nritems(leaf);
3086
int ret;
3087
3088
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3089
if (ret < 0) {
3090
btrfs_crit(fs_info,
3091
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3092
ret,
3093
(unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3094
leaf_space_used(leaf, 0, nritems), nritems);
3095
}
3096
return ret;
3097
}
3098
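/*
 * Leaf layout sketch, which is what btrfs_leaf_free_space() measures:
 *
 *	[ header | item 0 | item 1 | ... ->  free  <- ... | data 1 | data 0 ]
 *
 * The item array grows towards the end of the block, item data grows
 * from the end towards the front, and the free space is whatever is
 * left in between.
 */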
3099
/*
3100
* min slot controls the lowest index we're willing to push to the
3101
* right. We'll push up to and including min_slot, but no lower
3102
*/
3103
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3104
struct btrfs_path *path,
3105
int data_size, int empty,
3106
struct extent_buffer *right,
3107
int free_space, u32 left_nritems,
3108
u32 min_slot)
3109
{
3110
struct btrfs_fs_info *fs_info = right->fs_info;
3111
struct extent_buffer *left = path->nodes[0];
3112
struct extent_buffer *upper = path->nodes[1];
3113
struct btrfs_disk_key disk_key;
3114
int slot;
3115
u32 i;
3116
int push_space = 0;
3117
int push_items = 0;
3118
u32 nr;
3119
u32 right_nritems;
3120
u32 data_end;
3121
u32 this_item_size;
3122
3123
if (empty)
3124
nr = 0;
3125
else
3126
nr = max_t(u32, 1, min_slot);
3127
3128
if (path->slots[0] >= left_nritems)
3129
push_space += data_size;
3130
3131
slot = path->slots[1];
3132
i = left_nritems - 1;
3133
while (i >= nr) {
3134
if (!empty && push_items > 0) {
3135
if (path->slots[0] > i)
3136
break;
3137
if (path->slots[0] == i) {
3138
int space = btrfs_leaf_free_space(left);
3139
3140
if (space + push_space * 2 > free_space)
3141
break;
3142
}
3143
}
3144
3145
if (path->slots[0] == i)
3146
push_space += data_size;
3147
3148
this_item_size = btrfs_item_size(left, i);
3149
if (this_item_size + sizeof(struct btrfs_item) +
3150
push_space > free_space)
3151
break;
3152
3153
push_items++;
3154
push_space += this_item_size + sizeof(struct btrfs_item);
3155
if (i == 0)
3156
break;
3157
i--;
3158
}
3159
3160
if (push_items == 0)
3161
goto out_unlock;
3162
3163
WARN_ON(!empty && push_items == left_nritems);
3164
3165
/* push left to right */
3166
right_nritems = btrfs_header_nritems(right);
3167
3168
push_space = btrfs_item_data_end(left, left_nritems - push_items);
3169
push_space -= leaf_data_end(left);
3170
3171
/* make room in the right data area */
3172
data_end = leaf_data_end(right);
3173
memmove_leaf_data(right, data_end - push_space, data_end,
3174
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3175
3176
/* copy from the left data area */
3177
copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3178
leaf_data_end(left), push_space);
3179
3180
memmove_leaf_items(right, push_items, 0, right_nritems);
3181
3182
/* copy the items from left to right */
3183
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3184
3185
/* update the item pointers */
3186
right_nritems += push_items;
3187
btrfs_set_header_nritems(right, right_nritems);
3188
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3189
for (i = 0; i < right_nritems; i++) {
3190
push_space -= btrfs_item_size(right, i);
3191
btrfs_set_item_offset(right, i, push_space);
3192
}
3193
3194
left_nritems -= push_items;
3195
btrfs_set_header_nritems(left, left_nritems);
3196
3197
if (left_nritems)
3198
btrfs_mark_buffer_dirty(trans, left);
3199
else
3200
btrfs_clear_buffer_dirty(trans, left);
3201
3202
btrfs_mark_buffer_dirty(trans, right);
3203
3204
btrfs_item_key(right, &disk_key, 0);
3205
btrfs_set_node_key(upper, &disk_key, slot + 1);
3206
btrfs_mark_buffer_dirty(trans, upper);
3207
3208
/* then fixup the leaf pointer in the path */
3209
if (path->slots[0] >= left_nritems) {
3210
path->slots[0] -= left_nritems;
3211
if (btrfs_header_nritems(path->nodes[0]) == 0)
3212
btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3213
btrfs_tree_unlock(path->nodes[0]);
3214
free_extent_buffer(path->nodes[0]);
3215
path->nodes[0] = right;
3216
path->slots[1] += 1;
3217
} else {
3218
btrfs_tree_unlock(right);
3219
free_extent_buffer(right);
3220
}
3221
return 0;
3222
3223
out_unlock:
3224
btrfs_tree_unlock(right);
3225
free_extent_buffer(right);
3226
return 1;
3227
}
3228
3229
/*
3230
* push some data in the path leaf to the right, trying to free up at
3231
* least data_size bytes. returns zero if the push worked, nonzero otherwise
3232
*
3233
* returns 1 if the push failed because the other node didn't have enough
3234
* room, 0 if everything worked out and < 0 if there were major errors.
3235
*
3236
* this will push starting from min_slot to the end of the leaf. It won't
3237
* push any slot lower than min_slot
3238
*/
3239
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3240
*root, struct btrfs_path *path,
3241
int min_data_size, int data_size,
3242
int empty, u32 min_slot)
3243
{
3244
struct extent_buffer *left = path->nodes[0];
3245
struct extent_buffer *right;
3246
struct extent_buffer *upper;
3247
int slot;
3248
int free_space;
3249
u32 left_nritems;
3250
int ret;
3251
3252
if (!path->nodes[1])
3253
return 1;
3254
3255
slot = path->slots[1];
3256
upper = path->nodes[1];
3257
if (slot >= btrfs_header_nritems(upper) - 1)
3258
return 1;
3259
3260
btrfs_assert_tree_write_locked(path->nodes[1]);
3261
3262
right = btrfs_read_node_slot(upper, slot + 1);
3263
if (IS_ERR(right))
3264
return PTR_ERR(right);
3265
3266
btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
3267
3268
free_space = btrfs_leaf_free_space(right);
3269
if (free_space < data_size)
3270
goto out_unlock;
3271
3272
ret = btrfs_cow_block(trans, root, right, upper,
3273
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3274
if (ret)
3275
goto out_unlock;
3276
3277
left_nritems = btrfs_header_nritems(left);
3278
if (left_nritems == 0)
3279
goto out_unlock;
3280
3281
if (check_sibling_keys(left, right)) {
3282
ret = -EUCLEAN;
3283
btrfs_abort_transaction(trans, ret);
3284
btrfs_tree_unlock(right);
3285
free_extent_buffer(right);
3286
return ret;
3287
}
3288
if (path->slots[0] == left_nritems && !empty) {
3289
/* Key greater than all keys in the leaf, right neighbor has
3290
* enough room for it and we're not emptying our leaf to delete
3291
* it, therefore use right neighbor to insert the new item and
3292
* no need to touch/dirty our left leaf. */
3293
btrfs_tree_unlock(left);
3294
free_extent_buffer(left);
3295
path->nodes[0] = right;
3296
path->slots[0] = 0;
3297
path->slots[1]++;
3298
return 0;
3299
}
3300
3301
return __push_leaf_right(trans, path, min_data_size, empty, right,
3302
free_space, left_nritems, min_slot);
3303
out_unlock:
3304
btrfs_tree_unlock(right);
3305
free_extent_buffer(right);
3306
return 1;
3307
}
3308
3309
/*
3310
* push some data in the path leaf to the left, trying to free up at
3311
* least data_size bytes. returns zero if the push worked, nonzero otherwise
3312
*
3313
* max_slot can put a limit on how far into the leaf we'll push items. The
3314
* item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3315
* items
3316
*/
3317
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3318
struct btrfs_path *path, int data_size,
3319
int empty, struct extent_buffer *left,
3320
int free_space, u32 right_nritems,
3321
u32 max_slot)
3322
{
3323
struct btrfs_fs_info *fs_info = left->fs_info;
3324
struct btrfs_disk_key disk_key;
3325
struct extent_buffer *right = path->nodes[0];
3326
int i;
3327
int push_space = 0;
3328
int push_items = 0;
3329
u32 old_left_nritems;
3330
u32 nr;
3331
int ret = 0;
3332
u32 this_item_size;
3333
u32 old_left_item_size;
3334
3335
if (empty)
3336
nr = min(right_nritems, max_slot);
3337
else
3338
nr = min(right_nritems - 1, max_slot);
3339
3340
for (i = 0; i < nr; i++) {
3341
if (!empty && push_items > 0) {
3342
if (path->slots[0] < i)
3343
break;
3344
if (path->slots[0] == i) {
3345
int space = btrfs_leaf_free_space(right);
3346
3347
if (space + push_space * 2 > free_space)
3348
break;
3349
}
3350
}
3351
3352
if (path->slots[0] == i)
3353
push_space += data_size;
3354
3355
this_item_size = btrfs_item_size(right, i);
3356
if (this_item_size + sizeof(struct btrfs_item) + push_space >
3357
free_space)
3358
break;
3359
3360
push_items++;
3361
push_space += this_item_size + sizeof(struct btrfs_item);
3362
}
3363
3364
if (push_items == 0) {
3365
ret = 1;
3366
goto out;
3367
}
3368
WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3369
3370
/* push data from right to left */
3371
copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3372
3373
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3374
btrfs_item_offset(right, push_items - 1);
3375
3376
copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3377
btrfs_item_offset(right, push_items - 1), push_space);
3378
old_left_nritems = btrfs_header_nritems(left);
3379
BUG_ON(old_left_nritems <= 0);
3380
3381
old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3382
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3383
u32 ioff;
3384
3385
ioff = btrfs_item_offset(left, i);
3386
btrfs_set_item_offset(left, i,
3387
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3388
}
3389
btrfs_set_header_nritems(left, old_left_nritems + push_items);
3390
3391
/* fixup right node */
3392
if (push_items > right_nritems)
3393
WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3394
right_nritems);
3395
3396
if (push_items < right_nritems) {
3397
push_space = btrfs_item_offset(right, push_items - 1) -
3398
leaf_data_end(right);
3399
memmove_leaf_data(right,
3400
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3401
leaf_data_end(right), push_space);
3402
3403
memmove_leaf_items(right, 0, push_items,
3404
btrfs_header_nritems(right) - push_items);
3405
}
3406
3407
right_nritems -= push_items;
3408
btrfs_set_header_nritems(right, right_nritems);
3409
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3410
for (i = 0; i < right_nritems; i++) {
3411
push_space = push_space - btrfs_item_size(right, i);
3412
btrfs_set_item_offset(right, i, push_space);
3413
}
3414
3415
btrfs_mark_buffer_dirty(trans, left);
3416
if (right_nritems)
3417
btrfs_mark_buffer_dirty(trans, right);
3418
else
3419
btrfs_clear_buffer_dirty(trans, right);
3420
3421
btrfs_item_key(right, &disk_key, 0);
3422
fixup_low_keys(trans, path, &disk_key, 1);
3423
3424
/* then fixup the leaf pointer in the path */
3425
if (path->slots[0] < push_items) {
3426
path->slots[0] += old_left_nritems;
3427
btrfs_tree_unlock(path->nodes[0]);
3428
free_extent_buffer(path->nodes[0]);
3429
path->nodes[0] = left;
3430
path->slots[1] -= 1;
3431
} else {
3432
btrfs_tree_unlock(left);
3433
free_extent_buffer(left);
3434
path->slots[0] -= push_items;
3435
}
3436
BUG_ON(path->slots[0] < 0);
3437
return ret;
3438
out:
3439
btrfs_tree_unlock(left);
3440
free_extent_buffer(left);
3441
return ret;
3442
}
3443
3444
/*
3445
* push some data in the path leaf to the left, trying to free up at
3446
* least data_size bytes. returns zero if the push worked, nonzero otherwise
3447
*
3448
* max_slot can put a limit on how far into the leaf we'll push items. The
3449
* item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3450
* items
3451
*/
3452
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3453
*root, struct btrfs_path *path, int min_data_size,
3454
int data_size, int empty, u32 max_slot)
3455
{
3456
struct extent_buffer *right = path->nodes[0];
3457
struct extent_buffer *left;
3458
int slot;
3459
int free_space;
3460
u32 right_nritems;
3461
int ret = 0;
3462
3463
slot = path->slots[1];
3464
if (slot == 0)
3465
return 1;
3466
if (!path->nodes[1])
3467
return 1;
3468
3469
right_nritems = btrfs_header_nritems(right);
3470
if (right_nritems == 0)
3471
return 1;
3472
3473
btrfs_assert_tree_write_locked(path->nodes[1]);
3474
3475
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3476
if (IS_ERR(left))
3477
return PTR_ERR(left);
3478
3479
btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
3480
3481
free_space = btrfs_leaf_free_space(left);
3482
if (free_space < data_size) {
3483
ret = 1;
3484
goto out;
3485
}
3486
3487
ret = btrfs_cow_block(trans, root, left,
3488
path->nodes[1], slot - 1, &left,
3489
BTRFS_NESTING_LEFT_COW);
3490
if (ret) {
3491
/* we hit -ENOSPC, but it isn't fatal here */
3492
if (ret == -ENOSPC)
3493
ret = 1;
3494
goto out;
3495
}
3496
3497
if (check_sibling_keys(left, right)) {
3498
ret = -EUCLEAN;
3499
btrfs_abort_transaction(trans, ret);
3500
goto out;
3501
}
3502
return __push_leaf_left(trans, path, min_data_size, empty, left,
3503
free_space, right_nritems, max_slot);
3504
out:
3505
btrfs_tree_unlock(left);
3506
free_extent_buffer(left);
3507
return ret;
3508
}
3509
3510
/*
3511
* split the path's leaf in two, making sure there is at least data_size
3512
* available for the resulting leaf level of the path.
3513
*/
3514
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3515
struct btrfs_path *path,
3516
struct extent_buffer *l,
3517
struct extent_buffer *right,
3518
int slot, int mid, int nritems)
3519
{
3520
struct btrfs_fs_info *fs_info = trans->fs_info;
3521
int data_copy_size;
3522
int rt_data_off;
3523
int i;
3524
int ret;
3525
struct btrfs_disk_key disk_key;
3526
3527
nritems = nritems - mid;
3528
btrfs_set_header_nritems(right, nritems);
3529
data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3530
3531
copy_leaf_items(right, l, 0, mid, nritems);
3532
3533
copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3534
leaf_data_end(l), data_copy_size);
3535
3536
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3537
3538
for (i = 0; i < nritems; i++) {
3539
u32 ioff;
3540
3541
ioff = btrfs_item_offset(right, i);
3542
btrfs_set_item_offset(right, i, ioff + rt_data_off);
3543
}
3544
3545
btrfs_set_header_nritems(l, mid);
3546
btrfs_item_key(right, &disk_key, 0);
3547
ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3548
if (ret < 0)
3549
return ret;
3550
3551
btrfs_mark_buffer_dirty(trans, right);
3552
btrfs_mark_buffer_dirty(trans, l);
3553
BUG_ON(path->slots[0] != slot);
3554
3555
if (mid <= slot) {
3556
btrfs_tree_unlock(path->nodes[0]);
3557
free_extent_buffer(path->nodes[0]);
3558
path->nodes[0] = right;
3559
path->slots[0] -= mid;
3560
path->slots[1] += 1;
3561
} else {
3562
btrfs_tree_unlock(right);
3563
free_extent_buffer(right);
3564
}
3565
3566
BUG_ON(path->slots[0] < 0);
3567
3568
return 0;
3569
}
3570
3571
/*
3572
* double splits happen when we need to insert a big item in the middle
3573
* of a leaf. A double split can leave us with 3 mostly empty leaves:
3574
* leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3575
* A B C
3576
*
3577
* We avoid this by trying to push the items on either side of our target
3578
* into the adjacent leaves. If all goes well we can avoid the double split
3579
* completely.
3580
*/
3581
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3582
struct btrfs_root *root,
3583
struct btrfs_path *path,
3584
int data_size)
3585
{
3586
int ret;
3587
int progress = 0;
3588
int slot;
3589
u32 nritems;
3590
int space_needed = data_size;
3591
3592
slot = path->slots[0];
3593
if (slot < btrfs_header_nritems(path->nodes[0]))
3594
space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3595
3596
/*
3597
* try to push all the items after our slot into the
3598
* right leaf
3599
*/
3600
ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3601
if (ret < 0)
3602
return ret;
3603
3604
if (ret == 0)
3605
progress++;
3606
3607
nritems = btrfs_header_nritems(path->nodes[0]);
3608
/*
3609
* our goal is to get our slot at the start or end of a leaf. If
3610
* we've done so we're done
3611
*/
3612
if (path->slots[0] == 0 || path->slots[0] == nritems)
3613
return 0;
3614
3615
if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3616
return 0;
3617
3618
/* try to push all the items before our slot into the left leaf */
3619
slot = path->slots[0];
3620
space_needed = data_size;
3621
if (slot > 0)
3622
space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3623
ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3624
if (ret < 0)
3625
return ret;
3626
3627
if (ret == 0)
3628
progress++;
3629
3630
if (progress)
3631
return 0;
3632
return 1;
3633
}
3634
3635
/*
3636
* split the path's leaf in two, making sure there is at least data_size
3637
* available for the resulting leaf level of the path.
3638
*
3639
* returns 0 if all went well and < 0 on failure.
3640
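*
* Rough summary of the strategy below: first try pushing items into the
* left or right sibling; if that does not free enough room, split at the
* middle item (split == 1), hand out a brand new empty leaf when the new
* item lands past either end (split == 0), or split twice when a big item
* must land in the middle (split == 2, see num_doubles).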
*/
3641
static noinline int split_leaf(struct btrfs_trans_handle *trans,
3642
struct btrfs_root *root,
3643
const struct btrfs_key *ins_key,
3644
struct btrfs_path *path, int data_size,
3645
int extend)
3646
{
3647
struct btrfs_disk_key disk_key;
3648
struct extent_buffer *l;
3649
u32 nritems;
3650
int mid;
3651
int slot;
3652
struct extent_buffer *right;
3653
struct btrfs_fs_info *fs_info = root->fs_info;
3654
int ret = 0;
3655
int wret;
3656
int split;
3657
int num_doubles = 0;
3658
int tried_avoid_double = 0;
3659
3660
l = path->nodes[0];
3661
slot = path->slots[0];
3662
if (extend && data_size + btrfs_item_size(l, slot) +
3663
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3664
return -EOVERFLOW;
3665
3666
/* first try to make some room by pushing left and right */
3667
if (data_size && path->nodes[1]) {
3668
int space_needed = data_size;
3669
3670
if (slot < btrfs_header_nritems(l))
3671
space_needed -= btrfs_leaf_free_space(l);
3672
3673
wret = push_leaf_right(trans, root, path, space_needed,
3674
space_needed, 0, 0);
3675
if (wret < 0)
3676
return wret;
3677
if (wret) {
3678
space_needed = data_size;
3679
if (slot > 0)
3680
space_needed -= btrfs_leaf_free_space(l);
3681
wret = push_leaf_left(trans, root, path, space_needed,
3682
space_needed, 0, (u32)-1);
3683
if (wret < 0)
3684
return wret;
3685
}
3686
l = path->nodes[0];
3687
3688
/* did the pushes work? */
3689
if (btrfs_leaf_free_space(l) >= data_size)
3690
return 0;
3691
}
3692
3693
if (!path->nodes[1]) {
3694
ret = insert_new_root(trans, root, path, 1);
3695
if (ret)
3696
return ret;
3697
}
3698
again:
3699
split = 1;
3700
l = path->nodes[0];
3701
slot = path->slots[0];
3702
nritems = btrfs_header_nritems(l);
3703
mid = (nritems + 1) / 2;
3704
3705
if (mid <= slot) {
3706
if (nritems == 1 ||
3707
leaf_space_used(l, mid, nritems - mid) + data_size >
3708
BTRFS_LEAF_DATA_SIZE(fs_info)) {
3709
if (slot >= nritems) {
3710
split = 0;
3711
} else {
3712
mid = slot;
3713
if (mid != nritems &&
3714
leaf_space_used(l, mid, nritems - mid) +
3715
data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3716
if (data_size && !tried_avoid_double)
3717
goto push_for_double;
3718
split = 2;
3719
}
3720
}
3721
}
3722
} else {
3723
if (leaf_space_used(l, 0, mid) + data_size >
3724
BTRFS_LEAF_DATA_SIZE(fs_info)) {
3725
if (!extend && data_size && slot == 0) {
3726
split = 0;
3727
} else if ((extend || !data_size) && slot == 0) {
3728
mid = 1;
3729
} else {
3730
mid = slot;
3731
if (mid != nritems &&
3732
leaf_space_used(l, mid, nritems - mid) +
3733
data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3734
if (data_size && !tried_avoid_double)
3735
goto push_for_double;
3736
split = 2;
3737
}
3738
}
3739
}
3740
}
3741
3742
if (split == 0)
3743
btrfs_cpu_key_to_disk(&disk_key, ins_key);
3744
else
3745
btrfs_item_key(l, &disk_key, mid);
3746
3747
/*
3748
* We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3749
* split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3750
* subclasses, which is 8 at the time of this patch, and we've maxed it
3751
* out. In the future we could add a
3752
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3753
* use BTRFS_NESTING_NEW_ROOT.
3754
*/
3755
right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3756
&disk_key, 0, l->start, 0, 0,
3757
num_doubles ? BTRFS_NESTING_NEW_ROOT :
3758
BTRFS_NESTING_SPLIT);
3759
if (IS_ERR(right))
3760
return PTR_ERR(right);
3761
3762
root_add_used_bytes(root);
3763
3764
if (split == 0) {
3765
if (mid <= slot) {
3766
btrfs_set_header_nritems(right, 0);
3767
ret = insert_ptr(trans, path, &disk_key,
3768
right->start, path->slots[1] + 1, 1);
3769
if (ret < 0) {
3770
btrfs_tree_unlock(right);
3771
free_extent_buffer(right);
3772
return ret;
3773
}
3774
btrfs_tree_unlock(path->nodes[0]);
3775
free_extent_buffer(path->nodes[0]);
3776
path->nodes[0] = right;
3777
path->slots[0] = 0;
3778
path->slots[1] += 1;
3779
} else {
3780
btrfs_set_header_nritems(right, 0);
3781
ret = insert_ptr(trans, path, &disk_key,
3782
right->start, path->slots[1], 1);
3783
if (ret < 0) {
3784
btrfs_tree_unlock(right);
3785
free_extent_buffer(right);
3786
return ret;
3787
}
3788
btrfs_tree_unlock(path->nodes[0]);
3789
free_extent_buffer(path->nodes[0]);
3790
path->nodes[0] = right;
3791
path->slots[0] = 0;
3792
if (path->slots[1] == 0)
3793
fixup_low_keys(trans, path, &disk_key, 1);
3794
}
3795
/*
3796
* We create a new leaf 'right' for the required ins_len and
3797
* we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3798
* the new item of ins_len bytes into 'right'.
3799
*/
3800
return ret;
3801
}
3802
3803
ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3804
if (ret < 0) {
3805
btrfs_tree_unlock(right);
3806
free_extent_buffer(right);
3807
return ret;
3808
}
3809
3810
if (split == 2) {
3811
BUG_ON(num_doubles != 0);
3812
num_doubles++;
3813
goto again;
3814
}
3815
3816
return 0;
3817
3818
push_for_double:
3819
push_for_double_split(trans, root, path, data_size);
3820
tried_avoid_double = 1;
3821
if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3822
return 0;
3823
goto again;
3824
}
3825
3826
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_RAID_STRIPE_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room. return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}

static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/*
	 * Shouldn't happen because the caller must have previously called
	 * setup_leaf_for_split() to make room for the new item in the leaf.
	 */
	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
		return -ENOSPC;

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(trans, leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation. After
 * the split, the path is pointing to the old item. The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, path, new_key, split_offset);
	return ret;
}

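/*
 * Example (a hedged sketch, not taken from a specific caller): a user that
 * wants to cut the item under 'path' in two at data offset 'split_offset'
 * hands btrfs_split_item() the key it wants the second half to carry. All
 * names here (trans, root, path, split_offset, second_half_offset) are
 * assumed caller state, not anything defined in this file.
 *
 *	struct btrfs_key new_key;
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset = second_half_offset;	// chosen by the caller
 *	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);
 *	// on success the path still points at the (now shorter) first half
 */
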
/*
 * make the item pointed to by the path smaller. new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 const struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_item_offset(leaf, i);
		btrfs_set_item_offset(leaf, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

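/*
 * Example (a hedged sketch, modeled on the checksum-tree callers): shrink a
 * csum item after the tail of the range it covers has been freed.
 * 'csums_left' and 'csum_size' are illustrative stand-ins for values the
 * real callers derive from fs_info; the arithmetic is illustrative only.
 *
 *	u32 new_size = csums_left * csum_size;
 *
 *	// drop the stale checksums from the end of the item
 *	btrfs_truncate_item(trans, path, new_size, 1);
 *
 * With from_end == 0 the bytes are instead removed from the front and the
 * item's key offset is bumped by the number of bytes removed.
 */
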
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       const struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_item_offset(leaf, i);
		btrfs_set_item_offset(leaf, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

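/*
 * Example (a hedged sketch, modeled on the inode-ref insertion path): when a
 * ref item for this inode already exists, grow it and append the new name at
 * its old end. The preceding search is assumed to have reserved 'ins_len'
 * bytes so the leaf is guaranteed to have room.
 *
 *	u32 ins_len = sizeof(struct btrfs_inode_ref) + name_len;
 *
 *	btrfs_extend_item(trans, path, ins_len);
 *	// ... write the new btrfs_inode_ref + name into the added space ...
 *
 * btrfs_extend_item() only moves the existing bytes and fixes up offsets;
 * filling the newly added space is the caller's job.
 */
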
/*
 * Make space in the node before inserting one or more items.
 *
 * @trans:	transaction handle
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:      information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(trans, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_item_offset(leaf, i);
			btrfs_set_item_offset(leaf, i,
					      ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

		/* shift the data */
		memmove_leaf_data(leaf, data_end - batch->total_data_size,
				  data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_item_offset(leaf, slot + i, data_end);
		btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Insert a new item into a leaf.
 *
 * @trans:     Transaction handle.
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(trans, root, path, &batch);
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 *
 * Returns: 0        on success
 *          -EEXIST  if the first key already exists
 *          < 0      on other errors
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(trans, root, path, batch);
	return 0;
}

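/*
 * Example (a hedged sketch): inserting two adjacent items in one pass, the
 * way batched callers such as the delayed-inode code drive this API. The
 * keys must be in ascending order and must not yet exist in the tree;
 * 'first_size' and 'second_size' are assumed caller values.
 *
 *	struct btrfs_key keys[2];
 *	u32 sizes[2] = { first_size, second_size };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = first_size + second_size,
 *		.nr = 2,
 *	};
 *
 *	// keys[0]/keys[1] filled in by the caller
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 *	if (!ret) {
 *		// path->slots[0] now points at the first new (empty) item;
 *		// write both items' data, then btrfs_mark_buffer_dirty().
 *	}
 */
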
/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}

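/*
 * Example (a hedged sketch, modeled on the file-extent splitting done in
 * fs/btrfs/file.c): cutting a regular extent item in two at file offset
 * 'split'. After the duplicate both items carry identical extent data, and
 * the caller trims each half; 'key' and 'split' are assumed caller state.
 *
 *	struct btrfs_key new_key = key;		// key of the original item
 *
 *	new_key.offset = split;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *	if (ret == -EAGAIN) {
 *		// leaf layout changed under us; search again and retry
 *	}
 *	// first item: shrink num_bytes to split - key.offset
 *	// second item: advance its extent offset and shrink accordingly
 */
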
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * This is exported for use inside btrfs-progs, don't un-export it.
 */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct btrfs_path *path, int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(parent, slot),
			      btrfs_node_key_ptr_offset(parent, slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
						    BTRFS_MOD_LOG_KEY_REMOVE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(trans, parent);
	return 0;
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent. zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used_bytes(root);

	refcount_inc(&leaf->refs);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	if (ret < 0)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

/*
 * delete the item at the leaf level in path. If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_item_offset(leaf, i);
			btrfs_set_item_offset(leaf, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty. We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to btrfs_del_ptr below
			 */
			slot = path->slots[1];
			refcount_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
			 */
			min_push_space = sizeof(struct btrfs_item) +
				btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty. Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(trans, leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(trans, leaf);
		}
	}
	return ret;
}

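/*
 * Example (a hedged sketch): the common single-item delete is just this
 * function with nr == 1, which is what the btrfs_del_item() helper in the
 * headers expands to.
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path,
 *				      path->slots[0], 1);
 *	btrfs_release_path(path);
 *
 * The -1 ins_len tells the search we intend to delete, so it pre-balances
 * leaves on the way down.
 */
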
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	ASSERT(path->lowest_level == 0);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* At level 0 we're done, setup the path and exit. */
		if (level == 0) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			/* Save our key so the caller can pick up from here. */
			btrfs_item_key_to_cpu(cur, min_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		path->slots[level] = slot;
		if (slot >= nritems) {
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0)
		btrfs_unlock_up_safe(path, 1);
	return ret;
}

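/*
 * Example (a hedged sketch, modeled on the incremental walkers built on this
 * helper): visit everything modified in or after transaction 'min_trans'.
 * The key-advance logic at the bottom just steps past the key we handled.
 *
 *	struct btrfs_key min_key = { 0 };	// start of the tree
 *
 *	while (1) {
 *		ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *		if (ret)		// 1 == done, < 0 == error
 *			break;
 *		// ... process the item at path->slots[0] ...
 *		btrfs_release_path(path);
 *		if (min_key.offset < (u64)-1) {
 *			min_key.offset++;
 *		} else if (min_key.type < (u8)-1) {
 *			min_key.type++;
 *			min_key.offset = 0;
 *		} else if (min_key.objectid < (u64)-1) {
 *			min_key.objectid++;
 *			min_key.type = 0;
 *			min_key.offset = 0;
 *		} else {
 *			break;
 *		}
 *	}
 */
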
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks. A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block. So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[level]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, 0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}

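/*
 * Example (a hedged sketch): the canonical full-range scan built on these
 * helpers, as driven by many callers via the btrfs_next_leaf()/
 * btrfs_next_item() wrappers (which pass time_seq == 0).
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)	// 1 == no more leaves, < 0 == error
 *				break;
 *			leaf = path->nodes[0];
 *		}
 *		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 *		// ... stop when key passes the range, else process item ...
 *		path->slots[0]++;
 *	}
 */
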
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}

int __init btrfs_ctree_init(void)
{
	btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
	if (!btrfs_path_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_ctree_exit(void)
{
	kmem_cache_destroy(btrfs_path_cachep);
}