// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
#include "super.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/prefetch.h>

static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *,
				       btree_path_idx_t, btree_path_idx_t);

static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
#ifdef TRACK_PATH_ALLOCATED
	return iter->ip_allocated;
#else
	return 0;
#endif
}

static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
static void bch2_trans_srcu_lock(struct btree_trans *);

static inline int __btree_path_cmp(const struct btree_path *l,
				   enum btree_id r_btree_id,
				   bool r_cached,
				   struct bpos r_pos,
				   unsigned r_level)
{
	/*
	 * Must match lock ordering as defined by __bch2_btree_node_lock:
	 */
	return cmp_int(l->btree_id, r_btree_id) ?:
		cmp_int((int) l->cached, (int) r_cached) ?:
		bpos_cmp(l->pos, r_pos) ?:
		-cmp_int(l->level, r_level);
}

static inline int btree_path_cmp(const struct btree_path *l,
				 const struct btree_path *r)
{
	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
}

static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_all_snapshots) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_all_snapshots) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_is_extents) &&
	    !bkey_eq(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}

static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
{
	return bpos_lt(path->pos, b->data->min_key);
}

static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
{
	return bpos_gt(path->pos, b->key.k.p);
}

static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
{
	return path->btree_id == b->c.btree_id &&
		!btree_path_pos_before_node(path, b) &&
		!btree_path_pos_after_node(path, b);
}

/* Debug: */

static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
					    struct btree_path *path)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(path, 0);

	if (!bch2_btree_node_relock(trans, path, 0))
		return;

	ck = (void *) path->l[0].b;
	BUG_ON(ck->key.btree_id != path->btree_id ||
	       !bkey_eq(ck->key.pos, path->pos));

	if (!locked)
		btree_node_unlock(trans, path, 0);
}

static void __bch2_btree_path_verify_level(struct btree_trans *trans,
					   struct btree_path *path, unsigned level)
{
	struct btree_path_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	struct printbuf buf3 = PRINTBUF;
	const char *msg;

	l	= &path->l[level];
	tmp	= l->iter;
	locked	= btree_node_locked(path, level);

	if (path->cached) {
		if (!level)
			__bch2_btree_path_verify_cached(trans, path);
		return;
	}

	if (!btree_path_node(path, level))
		return;

	if (!bch2_btree_node_relock_notrace(trans, path, level))
		return;

	BUG_ON(!btree_path_pos_in_node(path, l->b));

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past deleted keys:
	 */
	p = level
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		msg = "after";
		goto err;
	}

	if (!locked)
		btree_node_unlock(trans, path, level);
	return;
err:
	bch2_bpos_to_text(&buf1, path->pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);

		bch2_bkey_to_text(&buf2, &uk);
	} else {
		prt_printf(&buf2, "(none)");
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);

		bch2_bkey_to_text(&buf3, &uk);
	} else {
		prt_printf(&buf3, "(none)");
	}

	panic("path should be %s key at level %u:\n"
	      "path pos %s\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level, buf1.buf, buf2.buf, buf3.buf);
}

static void __bch2_btree_path_verify(struct btree_trans *trans,
				     struct btree_path *path)
{
	struct bch_fs *c = trans->c;

	for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
		if (!path->l[i].b) {
			BUG_ON(!path->cached &&
			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
			break;
		}

		__bch2_btree_path_verify_level(trans, path, i);
	}

	bch2_btree_path_verify_locks(trans, path);
}

void __bch2_trans_verify_paths(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned iter;

	trans_for_each_path(trans, path, iter)
		__bch2_btree_path_verify(trans, path);
}

static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
	BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);

	BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
	       (iter->flags & BTREE_ITER_all_snapshots));

	BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
	       (iter->flags & BTREE_ITER_all_snapshots) &&
	       !btree_type_has_snapshot_field(iter->btree_id));

	if (iter->update_path)
		__bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
	__bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}

static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
	       !iter->pos.snapshot);

	BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(iter->flags & BTREE_ITER_all_snapshots	? !bpos_eq(iter->pos, iter->k.p) :
	       !(iter->flags & BTREE_ITER_is_extents)	? !bkey_eq(iter->pos, iter->k.p) :
	       (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
		bkey_gt(iter->pos, iter->k.p)));
}

static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
					struct btree_iter *iter, struct bkey_s_c k)
{
	struct btree_iter copy;
	struct bkey_s_c prev;
	int ret = 0;

	if (!(iter->flags & BTREE_ITER_filter_snapshots))
		return 0;

	if (bkey_err(k) || !k.k)
		return 0;

	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
					  iter->snapshot,
					  k.k->p.snapshot));

	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
			     BTREE_ITER_nopreserve|
			     BTREE_ITER_all_snapshots);
	prev = bch2_btree_iter_prev(trans, &copy);
	if (!prev.k)
		goto out;

	ret = bkey_err(prev);
	if (ret)
		goto out;

	if (bkey_eq(prev.k->p, k.k->p) &&
	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
				      prev.k->p.snapshot) > 0) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bkey_to_text(&buf1, k.k);
		bch2_bkey_to_text(&buf2, prev.k);

		panic("iter snap %u\n"
		      "k    %s\n"
		      "prev %s\n",
		      iter->snapshot,
		      buf1.buf, buf2.buf);
	}
out:
	bch2_trans_iter_exit(trans, &copy);
	return ret;
}

void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			      struct bpos pos)
{
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	struct btree_path *path;
	struct trans_for_each_path_inorder_iter iter;
	struct printbuf buf = PRINTBUF;

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, iter) {
		if (path->btree_id != id ||
		    !btree_node_locked(path, 0) ||
		    !path->should_be_locked)
			continue;

		if (!path->cached) {
			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
			    bkey_le(pos, path->l[0].b->key.k.p))
				return;
		} else {
			if (bkey_eq(pos, path->pos))
				return;
		}
	}

	bch2_dump_trans_paths_updates(trans);
	bch2_bpos_to_text(&buf, pos);

	panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}

static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l)
{
	if (static_branch_unlikely(&bch2_debug_check_iterators))
		__bch2_btree_path_verify_level(trans, path, l);
}

static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path)
{
	if (static_branch_unlikely(&bch2_debug_check_iterators))
		__bch2_btree_path_verify(trans, path);
}

static inline void bch2_btree_iter_verify(struct btree_trans *trans,
					  struct btree_iter *iter)
{
	if (static_branch_unlikely(&bch2_debug_check_iterators))
		__bch2_btree_iter_verify(trans, iter);
}

static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	if (static_branch_unlikely(&bch2_debug_check_iterators))
		__bch2_btree_iter_verify_entry_exit(iter);
}

static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
					     struct bkey_s_c k)
{
	return static_branch_unlikely(&bch2_debug_check_iterators)
		? __bch2_btree_iter_verify_ret(trans, iter, k)
		: 0;
}

/* Btree path: fixups after btree updates */

static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_path_level *l = &path->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);
}

void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path_with_node(trans, b, path, i) {
		__bch2_btree_path_fix_key_modified(path, b, where);
		bch2_btree_path_verify_level(trans, path, b->c.level);
	}
}

static void __bch2_btree_node_iter_fix(struct btree_path *path,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    b->c.level) {
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}
}

void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_path *path,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_path *linked;
	unsigned i;

	if (node_iter != &path->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(path, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (static_branch_unlikely(&bch2_debug_check_iterators))
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_path_with_node(trans, b, linked, i) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_path_verify_level(trans, linked, b->c.level);
	}
}

/* Btree path level: pointer to a particular btree node and node iter */

static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
						  struct btree_path_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	return bkey_disassemble(l->b, k, u);
}

static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
							struct btree_path_level *l,
							struct bkey *u)
{
	return __btree_iter_unpack(c, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->data->min_key;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline bool btree_path_advance_to_pos(struct btree_path *path,
					     struct btree_path_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

static inline void __btree_path_level_init(struct btree_path *path,
					   unsigned level)
{
	struct btree_path_level *l = &path->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first non
	 * whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);
}

void bch2_btree_path_level_init(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	BUG_ON(path->cached);

	EBUG_ON(!btree_path_pos_in_node(path, b));

	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
	path->l[b->c.level].b = b;
	__btree_path_level_init(path, b->c.level);
}

/* Btree path: fixups after btree node updates: */

static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;

	trans_for_each_update(trans, i)
		if (!i->cached &&
		    i->level	== b->c.level &&
		    i->btree_id	== b->c.btree_id &&
		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
			i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;

			if (unlikely(trans->journal_replay_not_finished)) {
				struct bkey_i *j_k =
					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
								    i->k->k.p);

				if (j_k) {
					i->old_k = j_k->k;
					i->old_v = &j_k->v;
				}
			}
		}
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_trans_node_add(struct btree_trans *trans,
			 struct btree_path *path,
			 struct btree *b)
{
	struct btree_path *prev;

	BUG_ON(!btree_path_pos_in_node(path, b));

	while ((prev = prev_btree_path(trans, path)) &&
	       btree_path_pos_in_node(prev, b))
		path = prev;

	for (;
	     path && btree_path_pos_in_node(path, b);
	     path = next_btree_path(trans, path))
		if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
			enum btree_node_locked_type t =
				btree_lock_want(path, b->c.level);

			if (t != BTREE_NODE_UNLOCKED) {
				btree_node_unlock(trans, path, b->c.level);
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(trans, path, b->c.level, t);
			}

			bch2_btree_path_level_init(trans, path, b);
		}

	bch2_trans_revalidate_updates_in_node(trans, b);
}

void bch2_trans_node_drop(struct btree_trans *trans,
			  struct btree *b)
{
	struct btree_path *path;
	unsigned i, level = b->c.level;

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path_with_node(trans, b, path, i)
		__btree_path_level_init(path, b->c.level);

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/* Btree path: traverse, set_pos: */

static inline int btree_path_lock_root(struct btree_trans *trans,
				       struct btree_path *path,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
	enum six_lock_type lock_type;
	unsigned i;
	int ret;

	EBUG_ON(path->nodes_locked);

	while (1) {
		struct btree *b = READ_ONCE(r->b);
		if (unlikely(!b)) {
			BUG_ON(!r->error);
			return r->error;
		}

		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			path->level = depth_want;
			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(path, path->level);
		ret = btree_node_lock(trans, path, &b->c,
				      path->level, lock_type, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ret;
			BUG();
		}

		if (likely(b == READ_ONCE(r->b) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)
				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
			path->l[path->level].b = b;
			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;

			mark_btree_node_locked(trans, path, path->level,
					       (enum btree_node_locked_type) lock_type);
			bch2_btree_path_level_init(trans, path, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}

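/*
 * Prefetch the next few child nodes at the level we're descending through;
 * the readahead window is larger before BCH_FS_started is set (i.e. during
 * recovery/journal replay) than in normal operation.
 */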
noinline
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	jiter->fail_if_too_many_whiteouts = true;

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
{
	struct btree_path_level *l = &path->l[plevel];
	bool locked = btree_node_locked(path, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(trans, path, plevel);
}

static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);
	if (!k.k) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "node not found at pos ");
		bch2_bpos_to_text(&buf, path->pos);
		prt_str(&buf, " at btree ");
		bch2_btree_pos_to_text(&buf, c, l->b);

		ret = bch2_fs_topology_error(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	bkey_reassemble(&trans->btree_path_down, k);

	if ((flags & BTREE_ITER_prefetch) &&
	    c->opts.btree_node_prefetch)
		ret = btree_path_prefetch_j(trans, path, &jiter);

err:
	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}

static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
						     struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	prt_str(&buf, "node not found at pos ");
	bch2_bpos_to_text(&buf, path->pos);
	prt_str(&buf, " within parent node ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));

	bch2_fs_fatal_error(c, "%s", buf.buf);
	printbuf_exit(&buf);
	return bch_err_throw(c, btree_need_topology_repair);
}

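/*
 * Descend one level: look up the key for the child node at path->pos (from
 * the node iterator, or from the journal keys if journal replay hasn't
 * finished), lock the child node and initialize the path level for it.
 */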
static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree *b;
	unsigned level = path->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(path, level);
	int ret;

	EBUG_ON(!btree_node_locked(path, path->level));

	if (unlikely(trans->journal_replay_not_finished)) {
		ret = btree_node_iter_and_journal_peek(trans, path, flags);
		if (ret)
			return ret;
	} else {
		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
		if (unlikely(!k))
			return btree_node_missing_err(trans, path);

		bch2_bkey_unpack(l->b, &trans->btree_path_down, k);

		if (unlikely((flags & BTREE_ITER_prefetch)) &&
		    c->opts.btree_node_prefetch) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				return ret;
		}
	}

	b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
				level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		return ret;

	if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
	    likely(!trans->journal_replay_not_finished &&
		   trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	mark_btree_node_locked(trans, path, level,
			       (enum btree_node_locked_type) lock_type);
	path->level = level;
	bch2_btree_path_level_init(trans, path, b);

	bch2_btree_path_verify_locks(trans, path);
	return 0;
}

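/*
 * Traverse every path in the transaction, in sorted order, with the trans
 * unlocked first; retried from the top on transaction restart or memory
 * allocation failure.
 */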
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;
	unsigned long trace_ip = _RET_IP_;
	unsigned i;
	int ret = 0;

	if (trans->in_traverse_all)
		return bch_err_throw(c, transaction_restart_in_traverse_all);

	trans->in_traverse_all = true;
retry_all:
	trans->restarted = 0;
	trans->last_restarted_ip = 0;

	trans_for_each_path(trans, path, i)
		path->should_be_locked = false;

	btree_trans_sort_paths(trans);

	bch2_trans_unlock(trans);
	cond_resched();
	trans_set_locked(trans, false);

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		btree_path_idx_t idx = trans->sorted[i];

		/*
		 * Traversing a path can cause another path to be added at about
		 * the same position:
		 */
		if (trans->paths[idx].uptodate) {
			__btree_path_get(trans, &trans->paths[idx], false);
			ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
			__btree_path_put(trans, &trans->paths[idx], false);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, ENOMEM))
				goto retry_all;
			if (ret)
				goto err;
		} else {
			i++;
		}
	}

	/*
	 * We used to assert that all paths had been traversed here
	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
	 * path->should_be_locked is not set yet, we might have unlocked and
	 * then failed to relock a path - that's fine.
	 */
err:
	bch2_btree_cache_cannibalize_unlock(trans);

	trans->in_traverse_all = false;

	trace_and_count(c, trans_traverse_all, trans, trace_ip);
	return ret;
}

static inline bool btree_path_check_pos_in_node(struct btree_path *path,
						unsigned l, int check_pos)
{
	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
		return false;
	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
		return false;
	return true;
}

static inline bool btree_path_good_node(struct btree_trans *trans,
					struct btree_path *path,
					unsigned l, int check_pos)
{
	return is_btree_node(path, l) &&
		bch2_btree_node_relock(trans, path, l) &&
		btree_path_check_pos_in_node(path, l, check_pos);
}

static void btree_path_set_level_down(struct btree_trans *trans,
				      struct btree_path *path,
				      unsigned new_level)
{
	unsigned l;

	path->level = new_level;

	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(trans, path, l);

	btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
	bch2_btree_path_verify(trans, path);
}

static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
							 struct btree_path *path,
							 int check_pos)
{
	unsigned i, l = path->level;
again:
	while (btree_path_node(path, l) &&
	       !btree_path_good_node(trans, path, l, check_pos))
		__btree_path_set_level_up(trans, path, l++);

	/* If we need intent locks, take them too: */
	for (i = l + 1;
	     i < path->locks_want && btree_path_node(path, i);
	     i++)
		if (!bch2_btree_node_relock(trans, path, i)) {
			while (l <= i)
				__btree_path_set_level_up(trans, path, l++);
			goto again;
		}

	return l;
}

static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
						     struct btree_path *path,
						     int check_pos)
{
	return likely(btree_node_locked(path, path->level) &&
		      btree_path_check_pos_in_node(path, path->level, check_pos))
		? path->level
		: __btree_path_up_until_good_node(trans, path, check_pos);
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
int bch2_btree_path_traverse_one(struct btree_trans *trans,
				 btree_path_idx_t path_idx,
				 unsigned flags,
				 unsigned long trace_ip)
{
	struct btree_path *path = &trans->paths[path_idx];
	unsigned depth_want = path->level;
	int ret = -((int) trans->restarted);

	if (unlikely(ret))
		goto out;

	if (unlikely(!trans->srcu_held))
		bch2_trans_srcu_lock(trans);

	trace_btree_path_traverse_start(trans, path);

	/*
	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the path without a transaction restart:
	 */
	if (path->should_be_locked) {
		ret = bch2_btree_path_relock(trans, path, trace_ip);
		goto out;
	}

	if (path->cached) {
		ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
		goto out;
	}

	path = &trans->paths[path_idx];

	if (unlikely(path->level >= BTREE_MAX_DEPTH))
		goto out_uptodate;

	path->level = btree_path_up_until_good_node(trans, path, 0);
	unsigned max_level = path->level;

	EBUG_ON(btree_path_node(path, path->level) &&
		!btree_node_locked(path, path->level));

	/*
	 * Note: path->nodes[path->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_path_lock_root() comes next and that it can't fail
	 */
	while (path->level > depth_want) {
		ret = btree_path_node(path, path->level)
			? btree_path_down(trans, path, flags, trace_ip)
			: btree_path_lock_root(trans, path, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1) {
				/*
				 * No nodes at this level - got to the end of
				 * the btree:
				 */
				ret = 0;
				goto out;
			}

			__bch2_btree_path_unlock(trans, path);
			path->level = depth_want;
			path->l[path->level].b = ERR_PTR(ret);
			goto out;
		}
	}

	if (unlikely(max_level > path->level)) {
		struct btree_path *linked;
		unsigned iter;

		trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
			for (unsigned j = path->level + 1; j < max_level; j++)
				linked->l[j] = path->l[j];
	}

out_uptodate:
	path->uptodate = BTREE_ITER_UPTODATE;
	trace_btree_path_traverse_end(trans, path);
out:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
		panic("ret %s (%i) trans->restarted %s (%i)\n",
		      bch2_err_str(ret), ret,
		      bch2_err_str(trans->restarted), trans->restarted);
	bch2_btree_path_verify(trans, path);
	return ret;
}

static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
				   struct btree_path *src)
{
	unsigned i, offset = offsetof(struct btree_path, pos);

	memcpy((void *) dst + offset,
	       (void *) src + offset,
	       sizeof(struct btree_path) - offset);

	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
		unsigned t = btree_node_locked_type(dst, i);

		if (t != BTREE_NODE_UNLOCKED)
			six_lock_increment(&dst->l[i].b->c.lock, t);
	}
}

static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
					 bool intent, unsigned long ip)
{
	btree_path_idx_t new = btree_path_alloc(trans, src);
	btree_path_copy(trans, trans->paths + new, trans->paths + src);
	__btree_path_get(trans, trans->paths + new, intent);
#ifdef TRACK_PATH_ALLOCATED
	trans->paths[new].ip_allocated = ip;
#endif
	return new;
}

__flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
			btree_path_idx_t path, bool intent, unsigned long ip)
{
	struct btree_path *old = trans->paths + path;
	__btree_path_put(trans, trans->paths + path, intent);
	path = btree_path_clone(trans, path, intent, ip);
	trace_btree_path_clone(trans, old, trans->paths + path);
	trans->paths[path].preserve = false;
	return path;
}

btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
			  btree_path_idx_t path_idx, struct bpos new_pos,
			  bool intent, unsigned long ip)
{
	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);

	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	EBUG_ON(!trans->paths[path_idx].ref);

	trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);

	path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);

	struct btree_path *path = trans->paths + path_idx;
	path->pos		= new_pos;
	trans->paths_sorted	= false;

	if (unlikely(path->cached)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
		goto out;
	}

	unsigned level = btree_path_up_until_good_node(trans, path, cmp);

	if (btree_path_node(path, level)) {
		struct btree_path_level *l = &path->l[level];

		BUG_ON(!btree_node_locked(path, level));
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_path_advance_to_pos(path, l, 8))
			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

		/*
		 * Iterators to interior nodes should always be pointed at the first non
		 * whiteout:
		 */
		if (unlikely(level))
			bch2_btree_node_iter_peek(&l->iter, l->b);
	}

	if (unlikely(level != path->level)) {
		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
		__bch2_btree_path_unlock(trans, path);
	}
out:
	bch2_btree_path_verify(trans, path);
	return path_idx;
}

/* Btree path: main interface: */

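/*
 * Check whether an adjacent path (in sorted order) already points at the same
 * position, or at the same btree node - used when deciding whether a path
 * being put can be freed:
 */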
static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	return NULL;
}

static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	return NULL;
}

static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
{
	__bch2_btree_path_unlock(trans, trans->paths + path);
	btree_path_list_remove(trans, trans->paths + path);
	__clear_bit(path, trans->paths_allocated);
}

static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
{
	unsigned l = path->level;

	do {
		if (!btree_path_node(path, l))
			break;

		if (!is_btree_node(path, l))
			return false;

		if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
			return false;

		l++;
	} while (l < path->locks_want);

	return true;
}

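/*
 * Drop a reference to a path: if this was the last reference, the path is
 * freed unless preserve or should_be_locked is set - in which case it's only
 * freed if a duplicate path at the same position can take over those roles.
 */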
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
	struct btree_path *path = trans->paths + path_idx, *dup = NULL;

	if (!__btree_path_put(trans, path, intent))
		return;

	if (!path->preserve && !path->should_be_locked)
		goto free;

	dup = path->preserve
		? have_path_at_pos(trans, path)
		: have_node_at_pos(trans, path);
	if (!dup)
		return;

	/*
	 * If we need this path locked, the duplicate also has to be locked
	 * before we free this one:
	 */
	if (path->should_be_locked &&
	    !dup->should_be_locked &&
	    !trans->restarted) {
		if (!(trans->locked
		      ? bch2_btree_path_relock_norestart(trans, dup)
		      : bch2_btree_path_can_relock(trans, dup)))
			return;

		dup->should_be_locked = true;
	}

	BUG_ON(path->should_be_locked &&
	       !trans->restarted &&
	       trans->locked &&
	       !btree_node_locked(dup, dup->level));

	path->should_be_locked = false;
	dup->preserve |= path->preserve;
free:
	trace_btree_path_free(trans, path_idx, dup);
	__bch2_path_free(trans, path_idx);
}

void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
	      trans->restart_count, restart_count,
	      (void *) trans->last_begin_ip);
}

static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct printbuf buf = PRINTBUF;
	bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
	panic("in transaction restart: %s, last restarted by\n%s",
	      bch2_err_str(trans->restarted),
	      buf.buf);
#else
	panic("in transaction restart: %s, last restarted by %pS\n",
	      bch2_err_str(trans->restarted),
	      (void *) trans->last_restarted_ip);
#endif
}

void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
{
	if (trans->restarted)
		bch2_trans_in_restart_error(trans);

	if (!trans->locked)
		panic("trans should be locked, unlocked by %pS\n",
		      (void *) trans->last_unlock_ip);

	BUG();
}

noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
	prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
		   trans->nr_updates, trans->fn, trans->journal_res.seq);
	printbuf_indent_add(buf, 2);

	trans_for_each_update(trans, i) {
		struct bkey_s_c old = { &i->old_k, i->old_v };

		prt_str(buf, "update: btree=");
		bch2_btree_id_to_text(buf, i->btree_id);
		prt_printf(buf, " cached=%u %pS\n",
			   i->cached,
			   (void *) i->ip_allocated);

		prt_printf(buf, "  old ");
		bch2_bkey_val_to_text(buf, trans->c, old);
		prt_newline(buf);

		prt_printf(buf, "  new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
		prt_newline(buf);
	}

	for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
	     e != btree_trans_journal_entries_top(trans);
	     e = vstruct_next(e)) {
		bch2_journal_entry_to_text(buf, trans->c, e);
		prt_newline(buf);
	}

	printbuf_indent_sub(buf, 2);
}

static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
	struct btree_path *path = trans->paths + path_idx;

	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
		   path_idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
		   path->cached ? 'C' : 'B');
	bch2_btree_id_level_to_text(out, path->btree_id, path->level);
	prt_str(out, " pos ");
	bch2_bpos_to_text(out, path->pos);

	if (!path->cached && btree_node_locked(path, path->level)) {
		prt_char(out, ' ');
		struct btree *b = path_l(path)->b;
		bch2_bpos_to_text(out, b->data->min_key);
		prt_char(out, '-');
		bch2_bpos_to_text(out, b->key.k.p);
	}

#ifdef TRACK_PATH_ALLOCATED
	prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
}

static const char *btree_node_locked_str(enum btree_node_locked_type t)
{
	switch (t) {
	case BTREE_NODE_UNLOCKED:
		return "unlocked";
	case BTREE_NODE_READ_LOCKED:
		return "read";
	case BTREE_NODE_INTENT_LOCKED:
		return "intent";
	case BTREE_NODE_WRITE_LOCKED:
		return "write";
	default:
		return NULL;
	}
}

void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
	bch2_btree_path_to_text_short(out, trans, path_idx);

	struct btree_path *path = trans->paths + path_idx;

	prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
	prt_newline(out);

	printbuf_indent_add(out, 2);
	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
		prt_printf(out, "l=%u locks %s seq %u node ", l,
			   btree_node_locked_str(btree_node_locked_type(path, l)),
			   path->l[l].lock_seq);

		int ret = PTR_ERR_OR_ZERO(path->l[l].b);
		if (ret)
			prt_str(out, bch2_err_str(ret));
		else
			prt_printf(out, "%px", path->l[l].b);
		prt_newline(out);
	}
	printbuf_indent_sub(out, 2);
}

static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
				bool nosort)
{
	struct trans_for_each_path_inorder_iter iter;

	if (!nosort)
		btree_trans_sort_paths(trans);

	trans_for_each_path_idx_inorder(trans, iter) {
		bch2_btree_path_to_text_short(out, trans, iter.path_idx);
		prt_newline(out);
	}
}

noinline __cold
void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
{
	__bch2_trans_paths_to_text(out, trans, false);
}

static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
{
	struct printbuf buf = PRINTBUF;

	__bch2_trans_paths_to_text(&buf, trans, nosort);
	bch2_trans_updates_to_text(&buf, trans);

	bch2_print_str(trans->c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
	__bch2_dump_trans_paths_updates(trans, false);
}

noinline __cold
static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
	struct btree_transaction_stats *s = btree_trans_stats(trans);
	struct printbuf buf = PRINTBUF;
	size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);

	bch2_trans_paths_to_text(&buf, trans);

	if (!buf.allocation_failure) {
		mutex_lock(&s->lock);
		if (nr > s->nr_max_paths) {
			s->nr_max_paths = nr;
			swap(s->max_paths_text, buf.buf);
		}
		mutex_unlock(&s->lock);
	}

	printbuf_exit(&buf);

	trans->nr_paths_max = nr;
}

noinline __cold
int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
{
	if (trace_trans_restart_too_many_iters_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_trans_paths_to_text(&buf, trans);
		trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
		printbuf_exit(&buf);
	}

	count_event(trans->c, trans_restart_too_many_iters);

	return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}

static noinline void btree_path_overflow(struct btree_trans *trans)
{
	bch2_dump_trans_paths_updates(trans);
	bch_err(trans->c, "trans path overflow");
}

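/*
 * Grow the transaction's path/update arrays: allocate a new combined buffer at
 * double the current size, copy the old contents across, and publish the new
 * pointers with RCU before freeing the old allocation.
 */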
static noinline void btree_paths_realloc(struct btree_trans *trans)
{
	unsigned nr = trans->nr_paths * 2;

	void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
			   sizeof(struct btree_trans_paths) +
			   nr * sizeof(struct btree_path) +
			   nr * sizeof(btree_path_idx_t) + 8 +
			   nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);

	unsigned long *paths_allocated = p;
	memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
	p += BITS_TO_LONGS(nr) * sizeof(unsigned long);

	p += sizeof(struct btree_trans_paths);
	struct btree_path *paths = p;
	*trans_paths_nr(paths) = nr;
	memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
	p += nr * sizeof(struct btree_path);

	btree_path_idx_t *sorted = p;
	memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
	p += nr * sizeof(btree_path_idx_t) + 8;

	struct btree_insert_entry *updates = p;
	memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));

	unsigned long *old = trans->paths_allocated;

	rcu_assign_pointer(trans->paths_allocated,	paths_allocated);
	rcu_assign_pointer(trans->paths,		paths);
	rcu_assign_pointer(trans->sorted,		sorted);
	rcu_assign_pointer(trans->updates,		updates);

	trans->nr_paths		= nr;

	if (old != trans->_paths_allocated)
		kfree_rcu_mightsleep(old);
}

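/*
 * Allocate a new path, reallocating the path arrays if we've run out; returns
 * 0 on overflow (BTREE_ITER_MAX paths already in use).
 */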
static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
						btree_path_idx_t pos)
{
	btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);

	if (unlikely(idx == trans->nr_paths)) {
		if (trans->nr_paths == BTREE_ITER_MAX) {
			btree_path_overflow(trans);
			return 0;
		}

		btree_paths_realloc(trans);
	}

	/*
	 * Do this before marking the new path as allocated, since it won't be
	 * initialized yet:
	 */
	if (unlikely(idx > trans->nr_paths_max))
		bch2_trans_update_max_paths(trans);

	__set_bit(idx, trans->paths_allocated);

	struct btree_path *path = &trans->paths[idx];
	path->ref		= 0;
	path->intent_ref	= 0;
	path->nodes_locked	= 0;

	btree_path_list_add(trans, pos, idx);
	trans->paths_sorted = false;
	return idx;
}

btree_path_idx_t bch2_path_get(struct btree_trans *trans,
			       enum btree_id btree_id, struct bpos pos,
			       unsigned locks_want, unsigned level,
			       unsigned flags, unsigned long ip)
{
	struct btree_path *path;
	bool cached = flags & BTREE_ITER_cached;
	bool intent = flags & BTREE_ITER_intent;
	struct trans_for_each_path_inorder_iter iter;
	btree_path_idx_t path_pos = 0, path_idx;

	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_trans_verify_locks(trans);

	btree_trans_sort_paths(trans);

	if (intent)
		locks_want = max(locks_want, level + 1);
	locks_want = min(locks_want, BTREE_MAX_DEPTH);

	trans_for_each_path_inorder(trans, path, iter) {
		if (__btree_path_cmp(path,
				     btree_id,
				     cached,
				     pos,
				     level) > 0)
			break;

		path_pos = iter.path_idx;
	}

	if (path_pos &&
	    trans->paths[path_pos].cached	== cached &&
	    trans->paths[path_pos].btree_id	== btree_id &&
	    trans->paths[path_pos].level	== level &&
	    bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
		trace_btree_path_get(trans, trans->paths + path_pos, &pos);

		__btree_path_get(trans, trans->paths + path_pos, intent);
		path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
		path = trans->paths + path_idx;
	} else {
		path_idx = btree_path_alloc(trans, path_pos);
		path = trans->paths + path_idx;

		__btree_path_get(trans, path, intent);
		path->pos			= pos;
		path->btree_id			= btree_id;
		path->cached			= cached;
		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
		path->should_be_locked		= false;
		path->level			= level;
		path->locks_want		= locks_want;
		path->nodes_locked		= 0;
		for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
		path->ip_allocated		= ip;
#endif
		trans->paths_sorted		= false;

		trace_btree_path_alloc(trans, path);
	}

	if (!(flags & BTREE_ITER_nopreserve))
		path->preserve = true;

	/*
	 * If the path has locks_want greater than requested, we don't downgrade
	 * it here - on transaction restart because btree node split needs to
	 * upgrade locks, we might be putting/getting the iterator again.
	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
	 * a successful transaction commit.
	 */

	return path_idx;
}

btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
					    enum btree_id btree_id,
					    unsigned level,
					    struct bpos pos)
{
	btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
			     BTREE_ITER_nopreserve|
			     BTREE_ITER_intent, _RET_IP_);
	path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);

	struct btree_path *path = trans->paths + path_idx;
	bch2_btree_path_downgrade(trans, path);
	__bch2_btree_path_unlock(trans, path);
	return path_idx;
}

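/*
 * Return the key the path currently points at: if the btree node (or key
 * cache entry) has no key at exactly path->pos, a deleted key at path->pos is
 * synthesized instead.
 */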
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1831
{
1832
1833
struct btree_path_level *l = path_l(path);
1834
struct bkey_packed *_k;
1835
struct bkey_s_c k;
1836
1837
if (unlikely(!l->b))
1838
return bkey_s_c_null;
1839
1840
EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1841
EBUG_ON(!btree_node_locked(path, path->level));
1842
1843
if (!path->cached) {
1844
_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1845
k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1846
1847
EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1848
1849
if (!k.k || !bpos_eq(path->pos, k.k->p))
1850
goto hole;
1851
} else {
1852
struct bkey_cached *ck = (void *) path->l[0].b;
1853
if (!ck)
1854
return bkey_s_c_null;
1855
1856
EBUG_ON(path->btree_id != ck->key.btree_id ||
1857
!bkey_eq(path->pos, ck->key.pos));
1858
1859
*u = ck->k->k;
1860
k = (struct bkey_s_c) { u, &ck->k->v };
1861
}
1862
1863
return k;
1864
hole:
1865
bkey_init(u);
1866
u->p = path->pos;
1867
return (struct bkey_s_c) { u, NULL };
1868
}
1869
1870
void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
1871
{
1872
if (!iter->path || trans->restarted)
1873
return;
1874
1875
struct btree_path *path = btree_iter_path(trans, iter);
1876
path->preserve = false;
1877
if (path->ref == 1)
1878
path->should_be_locked = false;
1879
}
1880
/* Btree iterators: */
1881
1882
int __must_check
1883
__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
1884
{
1885
return bch2_btree_path_traverse(trans, iter->path, iter->flags);
1886
}
1887
1888
int __must_check
1889
bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
1890
{
1891
bch2_trans_verify_not_unlocked_or_in_restart(trans);
1892
1893
iter->path = bch2_btree_path_set_pos(trans, iter->path,
1894
btree_iter_search_key(iter),
1895
iter->flags & BTREE_ITER_intent,
1896
btree_iter_ip_allocated(iter));
1897
1898
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1899
if (ret)
1900
return ret;
1901
1902
struct btree_path *path = btree_iter_path(trans, iter);
1903
if (btree_path_node(path, path->level))
1904
btree_path_set_should_be_locked(trans, path);
1905
return 0;
1906
}
1907
1908
/* Iterate across nodes (leaf and interior nodes) */
1909
1910
struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
1911
struct btree_iter *iter)
1912
{
1913
struct btree *b = NULL;
1914
int ret;
1915
1916
EBUG_ON(trans->paths[iter->path].cached);
1917
bch2_btree_iter_verify(trans, iter);
1918
1919
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1920
if (ret)
1921
goto err;
1922
1923
struct btree_path *path = btree_iter_path(trans, iter);
1924
b = btree_path_node(path, path->level);
1925
if (!b)
1926
goto out;
1927
1928
BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1929
1930
bkey_init(&iter->k);
1931
iter->k.p = iter->pos = b->key.k.p;
1932
1933
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1934
iter->flags & BTREE_ITER_intent,
1935
btree_iter_ip_allocated(iter));
1936
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
1937
out:
1938
bch2_btree_iter_verify_entry_exit(iter);
1939
bch2_btree_iter_verify(trans, iter);
1940
1941
return b;
1942
err:
1943
b = ERR_PTR(ret);
1944
goto out;
1945
}
1946
1947
/* Only kept for -tools */
1948
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
1949
struct btree_iter *iter)
1950
{
1951
struct btree *b;
1952
1953
while (b = bch2_btree_iter_peek_node(trans, iter),
1954
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
1955
bch2_trans_begin(trans);
1956
1957
return b;
1958
}
1959
1960
struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
1961
{
1962
struct btree *b = NULL;
1963
int ret;
1964
1965
EBUG_ON(trans->paths[iter->path].cached);
1966
bch2_trans_verify_not_unlocked_or_in_restart(trans);
1967
bch2_btree_iter_verify(trans, iter);
1968
1969
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1970
if (ret)
1971
goto err;
1972
1973
1974
struct btree_path *path = btree_iter_path(trans, iter);
1975
1976
/* already at end? */
1977
if (!btree_path_node(path, path->level))
1978
return NULL;
1979
1980
/* got to end? */
1981
if (!btree_path_node(path, path->level + 1)) {
1982
path->should_be_locked = false;
1983
btree_path_set_level_up(trans, path);
1984
return NULL;
1985
}
1986
1987
/*
1988
* We don't correctly handle nodes with extra intent locks here:
1989
* downgrade so we don't violate locking invariants
1990
*/
1991
bch2_btree_path_downgrade(trans, path);
1992
1993
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1994
trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1995
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1996
__bch2_btree_path_unlock(trans, path);
1997
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1998
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1999
btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
2000
goto err;
2001
}
2002
2003
b = btree_path_node(path, path->level + 1);
2004
2005
if (bpos_eq(iter->pos, b->key.k.p)) {
2006
__btree_path_set_level_up(trans, path, path->level++);
2007
} else {
2008
if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
2009
btree_node_unlock(trans, path, path->level + 1);
2010
2011
/*
2012
* Haven't gotten to the end of the parent node: go back down to
2013
* the next child node
2014
*/
2015
iter->path = bch2_btree_path_set_pos(trans, iter->path,
2016
bpos_successor(iter->pos),
2017
iter->flags & BTREE_ITER_intent,
2018
btree_iter_ip_allocated(iter));
2019
2020
path = btree_iter_path(trans, iter);
2021
btree_path_set_level_down(trans, path, iter->min_depth);
2022
2023
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2024
if (ret)
2025
goto err;
2026
2027
path = btree_iter_path(trans, iter);
2028
b = path->l[path->level].b;
2029
}
2030
2031
bkey_init(&iter->k);
2032
iter->k.p = iter->pos = b->key.k.p;
2033
2034
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
2035
iter->flags & BTREE_ITER_intent,
2036
btree_iter_ip_allocated(iter));
2037
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
2038
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
2039
out:
2040
bch2_btree_iter_verify_entry_exit(iter);
2041
bch2_btree_iter_verify(trans, iter);
2042
2043
return b;
2044
err:
2045
b = ERR_PTR(ret);
2046
goto out;
2047
}
2048
2049
/* Iterate across keys (in leaf nodes only) */
2050
2051
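/*
 * Advance the iterator past the key most recently returned (iter->k.p; for
 * extents this is already the end position of that key). Returns false if
 * the iterator was already at the end of the keyspace.
 */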
inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
2052
{
2053
struct bpos pos = iter->k.p;
2054
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
2055
? bpos_eq(pos, SPOS_MAX)
2056
: bkey_eq(pos, SPOS_MAX));
2057
2058
if (ret && !(iter->flags & BTREE_ITER_is_extents))
2059
pos = bkey_successor(iter, pos);
2060
bch2_btree_iter_set_pos(trans, iter, pos);
2061
return ret;
2062
}
2063
2064
inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
2065
{
2066
struct bpos pos = bkey_start_pos(&iter->k);
2067
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
2068
? bpos_eq(pos, POS_MIN)
2069
: bkey_eq(pos, POS_MIN));
2070
2071
if (ret && !(iter->flags & BTREE_ITER_is_extents))
2072
pos = bkey_predecessor(iter, pos);
2073
bch2_btree_iter_set_pos(trans, iter, pos);
2074
return ret;
2075
}
2076
2077
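/*
 * The *_peek_updates() helpers below overlay this transaction's pending,
 * not-yet-committed updates onto peek results, so that iteration within a
 * transaction sees its own writes.
 */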
static noinline
2078
void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
2079
struct bpos search_key, struct bkey_s_c *k)
2080
{
2081
struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
2082
2083
trans_for_each_update(trans, i)
2084
if (!i->key_cache_already_flushed &&
2085
i->btree_id == iter->btree_id &&
2086
bpos_le(i->k->k.p, search_key) &&
2087
bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
2088
iter->k = i->k->k;
2089
*k = bkey_i_to_s_c(i->k);
2090
}
2091
}
2092
2093
static noinline
2094
void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
2095
struct bpos search_key,
2096
struct bkey_s_c *k)
2097
{
2098
struct btree_path *path = btree_iter_path(trans, iter);
2099
struct bpos end = path_l(path)->b->key.k.p;
2100
2101
trans_for_each_update(trans, i)
2102
if (!i->key_cache_already_flushed &&
2103
i->btree_id == iter->btree_id &&
2104
bpos_ge(i->k->k.p, search_key) &&
2105
bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
2106
iter->k = i->k->k;
2107
*k = bkey_i_to_s_c(i->k);
2108
}
2109
}
2110
2111
static noinline
2112
void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
2113
struct bkey_s_c *k)
2114
{
2115
trans_for_each_update(trans, i)
2116
if (!i->key_cache_already_flushed &&
2117
i->btree_id == iter->btree_id &&
2118
bpos_eq(i->k->k.p, iter->pos)) {
2119
iter->k = i->k->k;
2120
*k = bkey_i_to_s_c(i->k);
2121
}
2122
}
2123
2124
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
2125
struct btree_iter *iter,
2126
struct bpos search_pos,
2127
struct bpos end_pos)
2128
{
2129
struct btree_path *path = btree_iter_path(trans, iter);
2130
2131
return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
2132
path->level,
2133
search_pos,
2134
end_pos,
2135
&iter->journal_idx);
2136
}
2137
2138
static noinline
2139
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
2140
struct btree_iter *iter)
2141
{
2142
struct btree_path *path = btree_iter_path(trans, iter);
2143
struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
2144
2145
if (k) {
2146
iter->k = k->k;
2147
return bkey_i_to_s_c(k);
2148
} else {
2149
return bkey_s_c_null;
2150
}
2151
}
2152
2153
static noinline
2154
void btree_trans_peek_journal(struct btree_trans *trans,
2155
struct btree_iter *iter,
2156
struct bpos search_key,
2157
struct bkey_s_c *k)
2158
{
2159
struct btree_path *path = btree_iter_path(trans, iter);
2160
struct bkey_i *next_journal =
2161
bch2_btree_journal_peek(trans, iter, search_key,
2162
k->k ? k->k->p : path_l(path)->b->key.k.p);
2163
if (next_journal) {
2164
iter->k = next_journal->k;
2165
*k = bkey_i_to_s_c(next_journal);
2166
}
2167
}
2168
2169
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
2170
struct btree_iter *iter,
2171
struct bpos search_key,
2172
struct bpos end_pos)
2173
{
2174
struct btree_path *path = btree_iter_path(trans, iter);
2175
2176
return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
2177
path->level,
2178
search_key,
2179
end_pos,
2180
&iter->journal_idx);
2181
}
2182
2183
static noinline
2184
void btree_trans_peek_prev_journal(struct btree_trans *trans,
2185
struct btree_iter *iter,
2186
struct bpos search_key,
2187
struct bkey_s_c *k)
2188
{
2189
struct btree_path *path = btree_iter_path(trans, iter);
2190
struct bkey_i *next_journal =
2191
bch2_btree_journal_peek_prev(trans, iter, search_key,
2192
k->k ? k->k->p : path_l(path)->b->data->min_key);
2193
2194
if (next_journal) {
2195
iter->k = next_journal->k;
2196
*k = bkey_i_to_s_c(next_journal);
2197
}
2198
}
2199
2200
/*
2201
* Checks btree key cache for key at iter->pos and returns it if present, or
2202
* bkey_s_c_null:
2203
*/
2204
static noinline
2205
struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
2206
struct bpos pos)
2207
{
2208
struct bch_fs *c = trans->c;
2209
struct bkey u;
2210
struct bkey_s_c k;
2211
int ret;
2212
2213
bch2_trans_verify_not_unlocked_or_in_restart(trans);
2214
2215
if ((iter->flags & BTREE_ITER_key_cache_fill) &&
2216
bpos_eq(iter->pos, pos))
2217
return bkey_s_c_null;
2218
2219
if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
2220
return bkey_s_c_null;
2221
2222
if (!iter->key_cache_path)
2223
iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
2224
iter->flags & BTREE_ITER_intent, 0,
2225
iter->flags|BTREE_ITER_cached|
2226
BTREE_ITER_cached_nofill,
2227
_THIS_IP_);
2228
2229
iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
2230
iter->flags & BTREE_ITER_intent,
2231
btree_iter_ip_allocated(iter));
2232
2233
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
2234
iter->flags|BTREE_ITER_cached) ?:
2235
bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
2236
if (unlikely(ret))
2237
return bkey_s_c_err(ret);
2238
2239
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
2240
if (!k.k)
2241
return k;
2242
2243
if ((iter->flags & BTREE_ITER_all_snapshots) &&
2244
!bpos_eq(pos, k.k->p))
2245
return bkey_s_c_null;
2246
2247
iter->k = u;
2248
k.k = &iter->k;
2249
btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
2250
return k;
2251
}
2252
2253
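/*
 * Core forward peek: walk forward from @search_key at the path's level,
 * folding in keys from the key cache, the journal and this transaction's
 * pending updates (as enabled by the iterator flags), skipping whiteouts,
 * until a live key or the end of the btree is found.
 */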
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
2254
struct bpos search_key)
2255
{
2256
struct bkey_s_c k, k2;
2257
int ret;
2258
2259
EBUG_ON(btree_iter_path(trans, iter)->cached);
2260
bch2_btree_iter_verify(trans, iter);
2261
2262
while (1) {
2263
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2264
iter->flags & BTREE_ITER_intent,
2265
btree_iter_ip_allocated(iter));
2266
2267
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2268
if (unlikely(ret)) {
2269
/* ensure that iter->k is consistent with iter->pos: */
2270
bch2_btree_iter_set_pos(trans, iter, iter->pos);
2271
k = bkey_s_c_err(ret);
2272
break;
2273
}
2274
2275
struct btree_path *path = btree_iter_path(trans, iter);
2276
struct btree_path_level *l = path_l(path);
2277
2278
if (unlikely(!l->b)) {
2279
/* No btree nodes at requested level: */
2280
bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
2281
k = bkey_s_c_null;
2282
break;
2283
}
2284
2285
btree_path_set_should_be_locked(trans, path);
2286
2287
k = btree_path_level_peek_all(trans->c, l, &iter->k);
2288
2289
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
2290
k.k &&
2291
(k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
2292
k = k2;
2293
if (bkey_err(k)) {
2294
bch2_btree_iter_set_pos(trans, iter, iter->pos);
2295
break;
2296
}
2297
}
2298
2299
if (unlikely(iter->flags & BTREE_ITER_with_journal))
2300
btree_trans_peek_journal(trans, iter, search_key, &k);
2301
2302
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2303
trans->nr_updates))
2304
bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
2305
2306
if (k.k && bkey_deleted(k.k)) {
2307
/*
2308
* If we've got a whiteout, and it's after the search
2309
* key, advance the search key to the whiteout instead
2310
* of just after the whiteout - it might be a btree
2311
* whiteout, with a real key at the same position, since
2312
* in the btree, deleted keys sort before non-deleted keys.
2313
*/
2314
search_key = !bpos_eq(search_key, k.k->p)
2315
? k.k->p
2316
: bpos_successor(k.k->p);
2317
continue;
2318
}
2319
2320
if (likely(k.k)) {
2321
break;
2322
} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
2323
/* Advance to next leaf node: */
2324
search_key = bpos_successor(l->b->key.k.p);
2325
} else {
2326
/* End of btree: */
2327
bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
2328
k = bkey_s_c_null;
2329
break;
2330
}
2331
}
2332
2333
bch2_btree_iter_verify(trans, iter);
2334
2335
if (trace___btree_iter_peek_enabled()) {
2336
CLASS(printbuf, buf)();
2337
2338
int ret = bkey_err(k);
2339
if (ret)
2340
prt_str(&buf, bch2_err_str(ret));
2341
else if (k.k)
2342
bch2_bkey_val_to_text(&buf, trans->c, k);
2343
else
2344
prt_str(&buf, "(null)");
2345
trace___btree_iter_peek(trans->c, buf.buf);
2346
}
2347
2348
return k;
2349
}
2350
2351
/**
2352
* bch2_btree_iter_peek_max() - returns first key greater than or equal to
2353
* iterator's current position
2354
* @trans: btree transaction object
2355
* @iter: iterator to peek from
2356
* @end: search limit: returns keys less than or equal to @end
2357
*
2358
* Returns: key if found, or an error extractable with bkey_err().
2359
*/
2360
struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
2361
struct bpos end)
2362
{
2363
struct bpos search_key = btree_iter_search_key(iter);
2364
struct bkey_s_c k;
2365
struct bpos iter_pos = iter->pos;
2366
int ret;
2367
2368
bch2_trans_verify_not_unlocked_or_in_restart(trans);
2369
bch2_btree_iter_verify_entry_exit(iter);
2370
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
2371
2372
ret = trans_maybe_inject_restart(trans, _RET_IP_);
2373
if (unlikely(ret)) {
2374
k = bkey_s_c_err(ret);
2375
goto out_no_locked;
2376
}
2377
2378
if (iter->update_path) {
2379
bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
2380
iter->update_path = 0;
2381
}
2382
2383
while (1) {
2384
k = __bch2_btree_iter_peek(trans, iter, search_key);
2385
if (unlikely(!k.k))
2386
goto end;
2387
if (unlikely(bkey_err(k)))
2388
goto out_no_locked;
2389
2390
if (iter->flags & BTREE_ITER_filter_snapshots) {
2391
/*
2392
* We need to check against @end before FILTER_SNAPSHOTS because
2393
* if we get to a different inode than the one requested, we might be
2394
* seeing keys for a different snapshot tree that will all be
2395
* filtered out.
2396
*
2397
* But we can't do the full check here, because bkey_start_pos()
2398
* isn't monotonically increasing before FILTER_SNAPSHOTS, and
2399
* that's what we check against in extents mode:
2400
*/
2401
if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
2402
? bkey_gt(k.k->p, end)
2403
: k.k->p.inode > end.inode))
2404
goto end;
2405
2406
if (iter->update_path &&
2407
!bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
2408
bch2_path_put(trans, iter->update_path,
2409
iter->flags & BTREE_ITER_intent);
2410
iter->update_path = 0;
2411
}
2412
2413
if ((iter->flags & BTREE_ITER_intent) &&
2414
!(iter->flags & BTREE_ITER_is_extents) &&
2415
!iter->update_path) {
2416
struct bpos pos = k.k->p;
2417
2418
if (pos.snapshot < iter->snapshot) {
2419
search_key = bpos_successor(k.k->p);
2420
continue;
2421
}
2422
2423
pos.snapshot = iter->snapshot;
2424
2425
/*
2426
* advance, same as on exit for iter->path, but only up
2427
* to snapshot
2428
*/
2429
__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
2430
iter->update_path = iter->path;
2431
2432
iter->update_path = bch2_btree_path_set_pos(trans,
2433
iter->update_path, pos,
2434
iter->flags & BTREE_ITER_intent,
2435
_THIS_IP_);
2436
ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2437
if (unlikely(ret)) {
2438
k = bkey_s_c_err(ret);
2439
goto out_no_locked;
2440
}
2441
}
2442
2443
/*
2444
* We can never have a key in a leaf node at POS_MAX, so
2445
* we don't have to check these successor() calls:
2446
*/
2447
if (!bch2_snapshot_is_ancestor(trans->c,
2448
iter->snapshot,
2449
k.k->p.snapshot)) {
2450
search_key = bpos_successor(k.k->p);
2451
continue;
2452
}
2453
2454
if (bkey_whiteout(k.k) &&
2455
!(iter->flags & BTREE_ITER_key_cache_fill)) {
2456
search_key = bkey_successor(iter, k.k->p);
2457
continue;
2458
}
2459
}
2460
2461
/*
2462
* iter->pos should be monotonically increasing, and always be
2463
* equal to the key we just returned - except extents can
2464
* straddle iter->pos:
2465
*/
2466
if (!(iter->flags & BTREE_ITER_is_extents))
2467
iter_pos = k.k->p;
2468
else
2469
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2470
2471
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
2472
iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
2473
bkey_gt(iter_pos, end)))
2474
goto end;
2475
2476
break;
2477
}
2478
2479
iter->pos = iter_pos;
2480
2481
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2482
iter->flags & BTREE_ITER_intent,
2483
btree_iter_ip_allocated(iter));
2484
2485
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
2486
out_no_locked:
2487
if (iter->update_path) {
2488
ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
2489
if (unlikely(ret))
2490
k = bkey_s_c_err(ret);
2491
else
2492
btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
2493
}
2494
2495
if (!(iter->flags & BTREE_ITER_all_snapshots))
2496
iter->pos.snapshot = iter->snapshot;
2497
2498
ret = bch2_btree_iter_verify_ret(trans, iter, k);
2499
if (unlikely(ret)) {
2500
bch2_btree_iter_set_pos(trans, iter, iter->pos);
2501
k = bkey_s_c_err(ret);
2502
}
2503
2504
bch2_btree_iter_verify_entry_exit(iter);
2505
2506
if (trace_btree_iter_peek_max_enabled()) {
2507
CLASS(printbuf, buf)();
2508
2509
int ret = bkey_err(k);
2510
if (ret)
2511
prt_str(&buf, bch2_err_str(ret));
2512
else if (k.k)
2513
bch2_bkey_val_to_text(&buf, trans->c, k);
2514
else
2515
prt_str(&buf, "(null)");
2516
trace_btree_iter_peek_max(trans->c, buf.buf);
2517
}
2518
2519
return k;
2520
end:
2521
bch2_btree_iter_set_pos(trans, iter, end);
2522
k = bkey_s_c_null;
2523
goto out_no_locked;
2524
}
2525
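/*
 * Illustrative lookup loop (a sketch only: it assumes the
 * bch2_trans_iter_init() helper from btree_iter.h, and @start/@end are
 * caller supplied positions - most in-tree callers use the
 * for_each_btree_key*() macros, which wrap this pattern):
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, start, 0);
 *	while (1) {
 *		k = bch2_btree_iter_peek_max(trans, &iter, end);
 *		ret = bkey_err(k);
 *		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
 *			bch2_trans_begin(trans);
 *			continue;
 *		}
 *		if (ret || !k.k)
 *			break;
 *		... use k ...
 *		bch2_btree_iter_advance(trans, &iter);
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */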
2526
/**
2527
* bch2_btree_iter_next() - returns first key greater than iterator's current
2528
* position
2529
* @trans: btree transaction object
2530
* @iter: iterator to peek from
2531
*
2532
* Returns: key if found, or an error extractable with bkey_err().
2533
*/
2534
struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
2535
{
2536
if (!bch2_btree_iter_advance(trans, iter))
2537
return bkey_s_c_null;
2538
2539
return bch2_btree_iter_peek(trans, iter);
2540
}
2541
2542
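/*
 * Core backward peek: the mirror image of __bch2_btree_iter_peek() - walk
 * backwards from @search_key, folding in keys from the key cache, the journal
 * and this transaction's pending updates, until a live key or the start of
 * the btree is found.
 */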
static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
2543
struct bpos search_key)
2544
{
2545
struct bkey_s_c k, k2;
2546
2547
bch2_btree_iter_verify(trans, iter);
2548
2549
while (1) {
2550
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2551
iter->flags & BTREE_ITER_intent,
2552
btree_iter_ip_allocated(iter));
2553
2554
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2555
if (unlikely(ret)) {
2556
/* ensure that iter->k is consistent with iter->pos: */
2557
bch2_btree_iter_set_pos(trans, iter, iter->pos);
2558
k = bkey_s_c_err(ret);
2559
break;
2560
}
2561
2562
struct btree_path *path = btree_iter_path(trans, iter);
2563
struct btree_path_level *l = path_l(path);
2564
2565
if (unlikely(!l->b)) {
2566
/* No btree nodes at requested level: */
2567
bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
2568
k = bkey_s_c_null;
2569
break;
2570
}
2571
2572
btree_path_set_should_be_locked(trans, path);
2573
2574
k = btree_path_level_peek_all(trans->c, l, &iter->k);
2575
if (!k.k || bpos_gt(k.k->p, search_key)) {
2576
k = btree_path_level_prev(trans, path, l, &iter->k);
2577
2578
BUG_ON(k.k && bpos_gt(k.k->p, search_key));
2579
}
2580
2581
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
2582
k.k &&
2583
(k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
2584
k = k2;
2585
if (bkey_err(k2)) {
2586
bch2_btree_iter_set_pos(trans, iter, iter->pos);
2587
break;
2588
}
2589
}
2590
2591
if (unlikely(iter->flags & BTREE_ITER_with_journal))
2592
btree_trans_peek_prev_journal(trans, iter, search_key, &k);
2593
2594
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2595
trans->nr_updates))
2596
bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
2597
2598
if (likely(k.k && !bkey_deleted(k.k))) {
2599
break;
2600
} else if (k.k) {
2601
search_key = bpos_predecessor(k.k->p);
2602
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
2603
/* Advance to previous leaf node: */
2604
search_key = bpos_predecessor(path->l[0].b->data->min_key);
2605
} else {
2606
/* Start of btree: */
2607
bch2_btree_iter_set_pos(trans, iter, POS_MIN);
2608
k = bkey_s_c_null;
2609
break;
2610
}
2611
}
2612
2613
bch2_btree_iter_verify(trans, iter);
2614
return k;
2615
}
2616
2617
/**
2618
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
2619
* iterator's current position
2620
* @trans: btree transaction object
2621
* @iter: iterator to peek from
2622
* @end: search limit: returns keys greater than or equal to @end
2623
*
2624
* Returns: key if found, or an error extractable with bkey_err().
2625
*/
2626
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
2627
struct bpos end)
2628
{
2629
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
2630
!bkey_eq(iter->pos, POS_MAX) &&
2631
!((iter->flags & BTREE_ITER_is_extents) &&
2632
iter->pos.offset == U64_MAX)) {
2633
2634
/*
2635
* bkey_start_pos(), for extents, is not monotonically
2636
* increasing until after filtering for snapshots:
2637
*
2638
* Thus, for extents we need to search forward until we find a
2639
* real visible extent - easiest to just use peek_slot() (which
2640
* internally uses peek() for extents)
2641
*/
2642
struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
2643
if (bkey_err(k))
2644
return k;
2645
2646
if (!bkey_deleted(k.k) &&
2647
(!(iter->flags & BTREE_ITER_is_extents) ||
2648
bkey_lt(bkey_start_pos(k.k), iter->pos)))
2649
return k;
2650
}
2651
2652
struct bpos search_key = iter->pos;
2653
struct bkey_s_c k;
2654
btree_path_idx_t saved_path = 0;
2655
2656
bch2_trans_verify_not_unlocked_or_in_restart(trans);
2657
bch2_btree_iter_verify_entry_exit(iter);
2658
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
2659
2660
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
2661
if (unlikely(ret)) {
2662
k = bkey_s_c_err(ret);
2663
goto out_no_locked;
2664
}
2665
2666
while (1) {
2667
k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
2668
if (unlikely(!k.k))
2669
goto end;
2670
if (unlikely(bkey_err(k)))
2671
goto out_no_locked;
2672
2673
if (iter->flags & BTREE_ITER_filter_snapshots) {
2674
struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
2675
if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
2676
/*
2677
* If we have a saved candidate, and we're past
2678
* the last possible snapshot overwrite, return
2679
* it:
2680
*/
2681
bch2_path_put(trans, iter->path,
2682
iter->flags & BTREE_ITER_intent);
2683
iter->path = saved_path;
2684
saved_path = 0;
2685
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
2686
break;
2687
}
2688
2689
/*
2690
* We need to check against @end before FILTER_SNAPSHOTS because
2691
* if we get to a different inode than the one requested, we might be
2692
* seeing keys for a different snapshot tree that will all be
2693
* filtered out.
2694
*/
2695
if (unlikely(bkey_lt(k.k->p, end)))
2696
goto end;
2697
2698
if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
2699
search_key = bpos_predecessor(k.k->p);
2700
continue;
2701
}
2702
2703
if (k.k->p.snapshot != iter->snapshot) {
2704
/*
2705
* Have a key visible in iter->snapshot, but
2706
* might have overwrites - save it and keep
2707
* searching. Unless it's a whiteout - then drop
2708
* our previous saved candidate:
2709
*/
2710
if (saved_path) {
2711
bch2_path_put(trans, saved_path,
2712
iter->flags & BTREE_ITER_intent);
2713
saved_path = 0;
2714
}
2715
2716
if (!bkey_whiteout(k.k)) {
2717
saved_path = btree_path_clone(trans, iter->path,
2718
iter->flags & BTREE_ITER_intent,
2719
_THIS_IP_);
2720
trace_btree_path_save_pos(trans,
2721
trans->paths + iter->path,
2722
trans->paths + saved_path);
2723
}
2724
2725
search_key = bpos_predecessor(k.k->p);
2726
continue;
2727
}
2728
2729
if (bkey_whiteout(k.k)) {
2730
search_key = bkey_predecessor(iter, k.k->p);
2731
search_key.snapshot = U32_MAX;
2732
continue;
2733
}
2734
}
2735
2736
EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
2737
iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
2738
bkey_gt(k.k->p, iter->pos));
2739
2740
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
2741
iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
2742
bkey_lt(k.k->p, end)))
2743
goto end;
2744
2745
break;
2746
}
2747
2748
/* Extents can straddle iter->pos: */
2749
iter->pos = bpos_min(iter->pos, k.k->p);
2750
2751
if (iter->flags & BTREE_ITER_filter_snapshots)
2752
iter->pos.snapshot = iter->snapshot;
2753
out_no_locked:
2754
if (saved_path)
2755
bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
2756
2757
bch2_btree_iter_verify_entry_exit(iter);
2758
bch2_btree_iter_verify(trans, iter);
2759
2760
if (trace_btree_iter_peek_prev_min_enabled()) {
2761
CLASS(printbuf, buf)();
2762
2763
int ret = bkey_err(k);
2764
if (ret)
2765
prt_str(&buf, bch2_err_str(ret));
2766
else if (k.k)
2767
bch2_bkey_val_to_text(&buf, trans->c, k);
2768
else
2769
prt_str(&buf, "(null)");
2770
trace_btree_iter_peek_prev_min(trans->c, buf.buf);
2771
}
2772
return k;
2773
end:
2774
bch2_btree_iter_set_pos(trans, iter, end);
2775
k = bkey_s_c_null;
2776
goto out_no_locked;
2777
}
2778
2779
/**
2780
* bch2_btree_iter_prev() - returns first key less than iterator's current
2781
* position
2782
* @trans: btree transaction object
2783
* @iter: iterator to peek from
2784
*
2785
* Returns: key if found, or an error extractable with bkey_err().
2786
*/
2787
struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
2788
{
2789
if (!bch2_btree_iter_rewind(trans, iter))
2790
return bkey_s_c_null;
2791
2792
return bch2_btree_iter_peek_prev(trans, iter);
2793
}
2794
2795
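/*
 * Slot semantics: return the key at exactly iter->pos. If nothing is present
 * there, a synthesized deleted key is returned instead of advancing - for
 * extents, a hole extending up to the next real key.
 */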
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
2796
{
2797
struct bpos search_key;
2798
struct bkey_s_c k;
2799
int ret;
2800
2801
bch2_trans_verify_not_unlocked_or_in_restart(trans);
2802
bch2_btree_iter_verify(trans, iter);
2803
bch2_btree_iter_verify_entry_exit(iter);
2804
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
2805
2806
ret = trans_maybe_inject_restart(trans, _RET_IP_);
2807
if (unlikely(ret)) {
2808
k = bkey_s_c_err(ret);
2809
goto out;
2810
}
2811
2812
/* extents can't span inode numbers: */
2813
if ((iter->flags & BTREE_ITER_is_extents) &&
2814
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2815
if (iter->pos.inode == KEY_INODE_MAX) {
2816
k = bkey_s_c_null;
2817
goto out2;
2818
}
2819
2820
bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
2821
}
2822
2823
search_key = btree_iter_search_key(iter);
2824
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2825
iter->flags & BTREE_ITER_intent,
2826
btree_iter_ip_allocated(iter));
2827
2828
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2829
if (unlikely(ret)) {
2830
k = bkey_s_c_err(ret);
2831
goto out;
2832
}
2833
2834
struct btree_path *path = btree_iter_path(trans, iter);
2835
if (unlikely(!btree_path_node(path, path->level))) {
2836
k = bkey_s_c_null;
2837
goto out2;
2838
}
2839
2840
btree_path_set_should_be_locked(trans, path);
2841
2842
if ((iter->flags & BTREE_ITER_cached) ||
2843
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
2844
k = bkey_s_c_null;
2845
2846
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
2847
trans->nr_updates)) {
2848
bch2_btree_trans_peek_slot_updates(trans, iter, &k);
2849
if (k.k)
2850
goto out;
2851
}
2852
2853
if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
2854
(k = btree_trans_peek_slot_journal(trans, iter)).k)
2855
goto out;
2856
2857
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
2858
(k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
2859
if (!bkey_err(k))
2860
iter->k = *k.k;
2861
/* We're not returning a key from iter->path: */
2862
goto out;
2863
}
2864
2865
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
2866
if (unlikely(!k.k))
2867
goto out;
2868
2869
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
2870
(iter->flags & BTREE_ITER_filter_snapshots) &&
2871
!(iter->flags & BTREE_ITER_key_cache_fill)))
2872
iter->k.type = KEY_TYPE_deleted;
2873
} else {
2874
struct bpos next;
2875
struct bpos end = iter->pos;
2876
2877
if (iter->flags & BTREE_ITER_is_extents)
2878
end.offset = U64_MAX;
2879
2880
EBUG_ON(btree_iter_path(trans, iter)->level);
2881
2882
if (iter->flags & BTREE_ITER_intent) {
2883
struct btree_iter iter2;
2884
2885
bch2_trans_copy_iter(trans, &iter2, iter);
2886
k = bch2_btree_iter_peek_max(trans, &iter2, end);
2887
2888
if (k.k && !bkey_err(k)) {
2889
swap(iter->key_cache_path, iter2.key_cache_path);
2890
iter->k = iter2.k;
2891
k.k = &iter->k;
2892
}
2893
bch2_trans_iter_exit(trans, &iter2);
2894
} else {
2895
struct bpos pos = iter->pos;
2896
2897
k = bch2_btree_iter_peek_max(trans, iter, end);
2898
if (unlikely(bkey_err(k)))
2899
bch2_btree_iter_set_pos(trans, iter, pos);
2900
else
2901
iter->pos = pos;
2902
}
2903
2904
if (unlikely(bkey_err(k)))
2905
goto out;
2906
2907
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2908
2909
if (bkey_lt(iter->pos, next)) {
2910
bkey_init(&iter->k);
2911
iter->k.p = iter->pos;
2912
2913
if (iter->flags & BTREE_ITER_is_extents) {
2914
bch2_key_resize(&iter->k,
2915
min_t(u64, KEY_SIZE_MAX,
2916
(next.inode == iter->pos.inode
2917
? next.offset
2918
: KEY_OFFSET_MAX) -
2919
iter->pos.offset));
2920
EBUG_ON(!iter->k.size);
2921
}
2922
2923
k = (struct bkey_s_c) { &iter->k, NULL };
2924
}
2925
}
2926
out:
2927
bch2_btree_iter_verify_entry_exit(iter);
2928
bch2_btree_iter_verify(trans, iter);
2929
ret = bch2_btree_iter_verify_ret(trans, iter, k);
2930
if (unlikely(ret))
2931
k = bkey_s_c_err(ret);
2932
out2:
2933
if (trace_btree_iter_peek_slot_enabled()) {
2934
CLASS(printbuf, buf)();
2935
2936
int ret = bkey_err(k);
2937
if (ret)
2938
prt_str(&buf, bch2_err_str(ret));
2939
else if (k.k)
2940
bch2_bkey_val_to_text(&buf, trans->c, k);
2941
else
2942
prt_str(&buf, "(null)");
2943
trace_btree_iter_peek_slot(trans->c, buf.buf);
2944
}
2945
2946
return k;
2947
}
2948
2949
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
2950
{
2951
if (!bch2_btree_iter_advance(trans, iter))
2952
return bkey_s_c_null;
2953
2954
return bch2_btree_iter_peek_slot(trans, iter);
2955
}
2956
2957
struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
2958
{
2959
if (!bch2_btree_iter_rewind(trans, iter))
2960
return bkey_s_c_null;
2961
2962
return bch2_btree_iter_peek_slot(trans, iter);
2963
}
2964
2965
/* Obsolete, but still used by rust wrapper in -tools */
2966
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
2967
{
2968
struct bkey_s_c k;
2969
2970
while (btree_trans_too_many_iters(trans) ||
2971
(k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
2972
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
2973
bch2_trans_begin(trans);
2974
2975
return k;
2976
}
2977
2978
/* new transactional stuff: */
2979
2980
#ifdef CONFIG_BCACHEFS_DEBUG
2981
static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2982
{
2983
struct btree_path *path;
2984
unsigned i;
2985
2986
BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
2987
2988
trans_for_each_path(trans, path, i) {
2989
BUG_ON(path->sorted_idx >= trans->nr_sorted);
2990
BUG_ON(trans->sorted[path->sorted_idx] != i);
2991
}
2992
2993
for (i = 0; i < trans->nr_sorted; i++) {
2994
unsigned idx = trans->sorted[i];
2995
2996
BUG_ON(!test_bit(idx, trans->paths_allocated));
2997
BUG_ON(trans->paths[idx].sorted_idx != i);
2998
}
2999
}
3000
3001
static void btree_trans_verify_sorted(struct btree_trans *trans)
3002
{
3003
struct btree_path *path, *prev = NULL;
3004
struct trans_for_each_path_inorder_iter iter;
3005
3006
if (!static_branch_unlikely(&bch2_debug_check_iterators))
3007
return;
3008
3009
trans_for_each_path_inorder(trans, path, iter) {
3010
if (prev && btree_path_cmp(prev, path) > 0) {
3011
__bch2_dump_trans_paths_updates(trans, true);
3012
panic("trans paths out of order!\n");
3013
}
3014
prev = path;
3015
}
3016
}
3017
#else
3018
static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
3019
static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
3020
#endif
3021
3022
void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
3023
{
3024
int i, l = 0, r = trans->nr_sorted, inc = 1;
3025
bool swapped;
3026
3027
btree_trans_verify_sorted_refs(trans);
3028
3029
if (trans->paths_sorted)
3030
goto out;
3031
3032
/*
3033
* Cocktail shaker sort: this is efficient because iterators will be
3034
* mostly sorted.
3035
*/
3036
do {
3037
swapped = false;
3038
3039
for (i = inc > 0 ? l : r - 2;
3040
i + 1 < r && i >= l;
3041
i += inc) {
3042
if (btree_path_cmp(trans->paths + trans->sorted[i],
3043
trans->paths + trans->sorted[i + 1]) > 0) {
3044
swap(trans->sorted[i], trans->sorted[i + 1]);
3045
trans->paths[trans->sorted[i]].sorted_idx = i;
3046
trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
3047
swapped = true;
3048
}
3049
}
3050
3051
if (inc > 0)
3052
--r;
3053
else
3054
l++;
3055
inc = -inc;
3056
} while (swapped);
3057
3058
trans->paths_sorted = true;
3059
out:
3060
btree_trans_verify_sorted(trans);
3061
}
3062
3063
static inline void btree_path_list_remove(struct btree_trans *trans,
3064
struct btree_path *path)
3065
{
3066
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
3067
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
3068
trans->nr_sorted--;
3069
memmove_u64s_down_small(trans->sorted + path->sorted_idx,
3070
trans->sorted + path->sorted_idx + 1,
3071
DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
3072
sizeof(u64) / sizeof(btree_path_idx_t)));
3073
#else
3074
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
3075
#endif
3076
for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
3077
trans->paths[trans->sorted[i]].sorted_idx = i;
3078
}
3079
3080
static inline void btree_path_list_add(struct btree_trans *trans,
3081
btree_path_idx_t pos,
3082
btree_path_idx_t path_idx)
3083
{
3084
struct btree_path *path = trans->paths + path_idx;
3085
3086
path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
3087
3088
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
3089
memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
3090
trans->sorted + path->sorted_idx,
3091
DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
3092
sizeof(u64) / sizeof(btree_path_idx_t)));
3093
trans->nr_sorted++;
3094
trans->sorted[path->sorted_idx] = path_idx;
3095
#else
3096
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
3097
#endif
3098
3099
for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
3100
trans->paths[trans->sorted[i]].sorted_idx = i;
3101
3102
btree_trans_verify_sorted_refs(trans);
3103
}
3104
3105
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
3106
{
3107
if (iter->update_path)
3108
bch2_path_put(trans, iter->update_path,
3109
iter->flags & BTREE_ITER_intent);
3110
if (iter->path)
3111
bch2_path_put(trans, iter->path,
3112
iter->flags & BTREE_ITER_intent);
3113
if (iter->key_cache_path)
3114
bch2_path_put(trans, iter->key_cache_path,
3115
iter->flags & BTREE_ITER_intent);
3116
iter->path = 0;
3117
iter->update_path = 0;
3118
iter->key_cache_path = 0;
3119
}
3120
3121
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
3122
struct btree_iter *iter,
3123
enum btree_id btree_id, struct bpos pos,
3124
unsigned flags)
3125
{
3126
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
3127
bch2_btree_iter_flags(trans, btree_id, 0, flags),
3128
_RET_IP_);
3129
}
3130
3131
void bch2_trans_node_iter_init(struct btree_trans *trans,
3132
struct btree_iter *iter,
3133
enum btree_id btree_id,
3134
struct bpos pos,
3135
unsigned locks_want,
3136
unsigned depth,
3137
unsigned flags)
3138
{
3139
flags |= BTREE_ITER_not_extents;
3140
flags |= BTREE_ITER_snapshot_field;
3141
flags |= BTREE_ITER_all_snapshots;
3142
3143
if (!depth && btree_id_cached(trans->c, btree_id))
3144
flags |= BTREE_ITER_with_key_cache;
3145
3146
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
3147
bch2_btree_iter_flags(trans, btree_id, depth, flags),
3148
_RET_IP_);
3149
3150
iter->min_depth = depth;
3151
3152
struct btree_path *path = btree_iter_path(trans, iter);
3153
BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
3154
BUG_ON(path->level != depth);
3155
BUG_ON(iter->min_depth != depth);
3156
}
3157
3158
void bch2_trans_copy_iter(struct btree_trans *trans,
3159
struct btree_iter *dst, struct btree_iter *src)
3160
{
3161
*dst = *src;
3162
#ifdef TRACK_PATH_ALLOCATED
3163
dst->ip_allocated = _RET_IP_;
3164
#endif
3165
if (src->path)
3166
__btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
3167
if (src->update_path)
3168
__btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
3169
dst->key_cache_path = 0;
3170
}
3171
3172
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3173
void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
3174
darray_trans_kmalloc_trace *trace)
3175
{
3176
printbuf_tabstops_reset(out);
3177
printbuf_tabstop_push(out, 60);
3178
3179
darray_for_each(*trace, i)
3180
prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
3181
}
3182
#endif
3183
3184
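/*
 * Bump allocator for per-transaction memory: allocations are only valid
 * until the next bch2_trans_begin(), which resets mem_top. Growing a buffer
 * that is already in use returns transaction_restart_mem_realloced, and the
 * actual realloc then happens in bch2_trans_begin().
 */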
void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
3185
{
3186
struct bch_fs *c = trans->c;
3187
unsigned new_top = trans->mem_top + size;
3188
unsigned old_bytes = trans->mem_bytes;
3189
unsigned new_bytes = roundup_pow_of_two(new_top);
3190
int ret;
3191
void *new_mem;
3192
void *p;
3193
3194
if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
3195
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3196
struct printbuf buf = PRINTBUF;
3197
bch2_log_msg_start(c, &buf);
3198
prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
3199
BTREE_TRANS_MEM_MAX);
3200
3201
bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
3202
bch2_print_str(c, KERN_ERR, buf.buf);
3203
printbuf_exit(&buf);
3204
#endif
3205
}
3206
3207
ret = trans_maybe_inject_restart(trans, _RET_IP_);
3208
if (ret)
3209
return ERR_PTR(ret);
3210
3211
struct btree_transaction_stats *s = btree_trans_stats(trans);
3212
if (new_bytes > s->max_mem) {
3213
mutex_lock(&s->lock);
3214
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3215
darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
3216
s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
3217
trans->trans_kmalloc_trace.nr);
3218
3219
memcpy(s->trans_kmalloc_trace.data,
3220
trans->trans_kmalloc_trace.data,
3221
sizeof(s->trans_kmalloc_trace.data[0]) *
3222
s->trans_kmalloc_trace.nr);
3223
#endif
3224
s->max_mem = new_bytes;
3225
mutex_unlock(&s->lock);
3226
}
3227
3228
if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
3229
EBUG_ON(trans->mem_bytes >= new_bytes);
3230
return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
3231
}
3232
3233
if (old_bytes) {
3234
trans->realloc_bytes_required = new_bytes;
3235
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
3236
return ERR_PTR(btree_trans_restart_ip(trans,
3237
BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
3238
}
3239
3240
EBUG_ON(trans->mem);
3241
3242
new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
3243
if (unlikely(!new_mem)) {
3244
bch2_trans_unlock(trans);
3245
3246
new_mem = kmalloc(new_bytes, GFP_KERNEL);
3247
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
3248
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
3249
new_bytes = BTREE_TRANS_MEM_MAX;
3250
trans->used_mempool = true;
3251
}
3252
3253
EBUG_ON(!new_mem);
3254
3255
trans->mem = new_mem;
3256
trans->mem_bytes = new_bytes;
3257
3258
ret = bch2_trans_relock(trans);
3259
if (ret)
3260
return ERR_PTR(ret);
3261
}
3262
3263
trans->mem = new_mem;
3264
trans->mem_bytes = new_bytes;
3265
3266
p = trans->mem + trans->mem_top;
3267
trans->mem_top += size;
3268
memset(p, 0, size);
3269
return p;
3270
}
3271
3272
static inline void check_srcu_held_too_long(struct btree_trans *trans)
3273
{
3274
WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
3275
"btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
3276
(jiffies - trans->srcu_lock_time) / HZ);
3277
}
3278
3279
void bch2_trans_srcu_unlock(struct btree_trans *trans)
3280
{
3281
if (trans->srcu_held) {
3282
struct bch_fs *c = trans->c;
3283
struct btree_path *path;
3284
unsigned i;
3285
3286
trans_for_each_path(trans, path, i)
3287
if (path->cached && !btree_node_locked(path, 0))
3288
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
3289
3290
check_srcu_held_too_long(trans);
3291
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3292
trans->srcu_held = false;
3293
}
3294
}
3295
3296
static void bch2_trans_srcu_lock(struct btree_trans *trans)
3297
{
3298
if (!trans->srcu_held) {
3299
trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
3300
trans->srcu_lock_time = jiffies;
3301
trans->srcu_held = true;
3302
}
3303
}
3304
3305
/**
3306
* bch2_trans_begin() - reset a transaction after an interrupted attempt
3307
* @trans: transaction to reset
3308
*
3309
* Returns: current restart counter, to be used with trans_was_restarted()
3310
*
3311
* While iterating over nodes or updating nodes, an attempt to lock a btree node
3312
* may return BCH_ERR_transaction_restart when the trylock fails. When this
3313
* occurs bch2_trans_begin() should be called and the transaction retried.
3314
*/
3315
u32 bch2_trans_begin(struct btree_trans *trans)
3316
{
3317
struct btree_path *path;
3318
unsigned i;
3319
u64 now;
3320
3321
bch2_trans_reset_updates(trans);
3322
3323
trans->restart_count++;
3324
trans->mem_top = 0;
3325
3326
if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
3327
EBUG_ON(!trans->mem || !trans->mem_bytes);
3328
unsigned new_bytes = trans->realloc_bytes_required;
3329
void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
3330
if (unlikely(!new_mem)) {
3331
bch2_trans_unlock(trans);
3332
new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
3333
3334
EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
3335
3336
if (!new_mem) {
3337
new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
3338
new_bytes = BTREE_TRANS_MEM_MAX;
3339
trans->used_mempool = true;
3340
kfree(trans->mem);
3341
}
3342
}
3343
trans->mem = new_mem;
3344
trans->mem_bytes = new_bytes;
3345
}
3346
3347
trans_for_each_path(trans, path, i) {
3348
path->should_be_locked = false;
3349
3350
/*
3351
* If the transaction wasn't restarted, we're presuming to be
3352
* doing something new: don't keep iterators except the ones that
3353
* are in use - except for the subvolumes btree:
3354
*/
3355
if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
3356
path->preserve = false;
3357
3358
/*
3359
* XXX: we probably shouldn't be doing this if the transaction
3360
* was restarted, but currently we still overflow transaction
3361
* iterators if we do that
3362
*/
3363
if (!path->ref && !path->preserve)
3364
__bch2_path_free(trans, i);
3365
else
3366
path->preserve = false;
3367
}
3368
3369
now = local_clock();
3370
3371
if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
3372
time_after64(now, trans->last_begin_time + 10))
3373
__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
3374
trans->last_begin_time, now);
3375
3376
if (!trans->restarted &&
3377
(need_resched() ||
3378
time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
3379
bch2_trans_unlock(trans);
3380
cond_resched();
3381
now = local_clock();
3382
}
3383
trans->last_begin_time = now;
3384
3385
if (unlikely(trans->srcu_held &&
3386
time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
3387
bch2_trans_srcu_unlock(trans);
3388
3389
trans->last_begin_ip = _RET_IP_;
3390
3391
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
3392
if (trans->restarted) {
3393
trans->restart_count_this_trans++;
3394
} else {
3395
trans->restart_count_this_trans = 0;
3396
}
3397
#endif
3398
3399
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3400
trans->trans_kmalloc_trace.nr = 0;
3401
#endif
3402
3403
trans_set_locked(trans, false);
3404
3405
if (trans->restarted) {
3406
bch2_btree_path_traverse_all(trans);
3407
trans->notrace_relock_fail = false;
3408
}
3409
3410
bch2_trans_verify_not_unlocked_or_in_restart(trans);
3411
return trans->restart_count;
3412
}
3413
3414
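/*
 * Illustrative restart loop around bch2_trans_begin() (a sketch; do_work()
 * is a hypothetical transactional helper, and in-tree code normally uses
 * wrappers such as lockrestart_do() rather than open coding this):
 *
 *	int ret;
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */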
const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
3415
3416
unsigned bch2_trans_get_fn_idx(const char *fn)
3417
{
3418
for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
3419
if (!bch2_btree_transaction_fns[i] ||
3420
bch2_btree_transaction_fns[i] == fn) {
3421
bch2_btree_transaction_fns[i] = fn;
3422
return i;
3423
}
3424
3425
pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
3426
return 0;
3427
}
3428
3429
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
3430
__acquires(&c->btree_trans_barrier)
3431
{
3432
struct btree_trans *trans;
3433
3434
if (IS_ENABLED(__KERNEL__)) {
3435
trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
3436
if (trans) {
3437
memset(trans, 0, offsetof(struct btree_trans, list));
3438
goto got_trans;
3439
}
3440
}
3441
3442
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
3443
memset(trans, 0, sizeof(*trans));
3444
3445
seqmutex_lock(&c->btree_trans_lock);
3446
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
3447
struct btree_trans *pos;
3448
pid_t pid = current->pid;
3449
3450
trans->locking_wait.task = current;
3451
3452
list_for_each_entry(pos, &c->btree_trans_list, list) {
3453
struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
3454
/*
3455
* We'd much prefer to be stricter here and completely
3456
* disallow multiple btree_trans in the same thread -
3457
* but the data move path calls bch2_write when we
3458
* already have a btree_trans initialized.
3459
*/
3460
BUG_ON(pos_task &&
3461
pid == pos_task->pid &&
3462
pos->locked);
3463
}
3464
}
3465
3466
list_add(&trans->list, &c->btree_trans_list);
3467
seqmutex_unlock(&c->btree_trans_lock);
3468
got_trans:
3469
trans->c = c;
3470
trans->last_begin_time = local_clock();
3471
trans->fn_idx = fn_idx;
3472
trans->locking_wait.task = current;
3473
trans->journal_replay_not_finished =
3474
unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
3475
atomic_inc_not_zero(&c->journal_keys.ref);
3476
trans->nr_paths = ARRAY_SIZE(trans->_paths);
3477
trans->paths_allocated = trans->_paths_allocated;
3478
trans->sorted = trans->_sorted;
3479
trans->paths = trans->_paths;
3480
trans->updates = trans->_updates;
3481
3482
*trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
3483
3484
trans->paths_allocated[0] = 1;
3485
3486
static struct lock_class_key lockdep_key;
3487
lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
3488
3489
if (fn_idx < BCH_TRANSACTIONS_NR) {
3490
trans->fn = bch2_btree_transaction_fns[fn_idx];
3491
3492
struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
3493
3494
if (s->max_mem) {
3495
unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
3496
3497
trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
3498
if (likely(trans->mem))
3499
trans->mem_bytes = expected_mem_bytes;
3500
}
3501
3502
trans->nr_paths_max = s->nr_max_paths;
3503
}
3504
3505
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
3506
trans->srcu_lock_time = jiffies;
3507
trans->srcu_held = true;
3508
trans_set_locked(trans, false);
3509
3510
closure_init_stack_release(&trans->ref);
3511
return trans;
3512
}
3513
3514
#ifdef CONFIG_BCACHEFS_DEBUG
3515
3516
static bool btree_paths_leaked(struct btree_trans *trans)
3517
{
3518
struct btree_path *path;
3519
unsigned i;
3520
3521
trans_for_each_path(trans, path, i)
3522
if (path->ref)
3523
return true;
3524
return false;
3525
}
3526
3527
static void check_btree_paths_leaked(struct btree_trans *trans)
3528
{
3529
if (btree_paths_leaked(trans)) {
3530
struct bch_fs *c = trans->c;
3531
struct btree_path *path;
3532
unsigned i;
3533
3534
struct printbuf buf = PRINTBUF;
3535
bch2_log_msg_start(c, &buf);
3536
3537
prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
3538
trans_for_each_path(trans, path, i)
3539
if (path->ref)
3540
prt_printf(&buf, "btree %s %pS\n",
3541
bch2_btree_id_str(path->btree_id),
3542
(void *) path->ip_allocated);
3543
3544
bch2_fs_emergency_read_only2(c, &buf);
3545
bch2_print_str(c, KERN_ERR, buf.buf);
3546
printbuf_exit(&buf);
3547
}
3548
}
3549
#else
3550
static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
3551
#endif
3552
3553
void bch2_trans_put(struct btree_trans *trans)
3554
__releases(&c->btree_trans_barrier)
3555
{
3556
struct bch_fs *c = trans->c;
3557
3558
if (trans->restarted)
3559
bch2_trans_in_restart_error(trans);
3560
3561
bch2_trans_unlock(trans);
3562
3563
trans_for_each_update(trans, i)
3564
__btree_path_put(trans, trans->paths + i->path, true);
3565
trans->nr_updates = 0;
3566
3567
check_btree_paths_leaked(trans);
3568
3569
if (trans->srcu_held) {
3570
check_srcu_held_too_long(trans);
3571
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3572
}
3573
3574
if (unlikely(trans->journal_replay_not_finished))
3575
bch2_journal_keys_put(c);
3576
3577
/*
3578
* trans->ref protects trans->locking_wait.task, btree_paths array; used
3579
* by cycle detector
3580
*/
3581
closure_return_sync(&trans->ref);
3582
trans->locking_wait.task = NULL;
3583
3584
#ifdef CONFIG_BCACHEFS_DEBUG
3585
darray_exit(&trans->last_restarted_trace);
3586
#endif
3587
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3588
darray_exit(&trans->trans_kmalloc_trace);
3589
#endif
3590
3591
unsigned long *paths_allocated = trans->paths_allocated;
3592
trans->paths_allocated = NULL;
3593
trans->paths = NULL;
3594
3595
if (paths_allocated != trans->_paths_allocated)
3596
kvfree_rcu_mightsleep(paths_allocated);
3597
3598
if (trans->used_mempool)
3599
mempool_free(trans->mem, &c->btree_trans_mem_pool);
3600
else
3601
kfree(trans->mem);
3602
3603
/* Userspace doesn't have a real percpu implementation: */
3604
if (IS_ENABLED(__KERNEL__))
3605
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
3606
3607
if (trans) {
3608
seqmutex_lock(&c->btree_trans_lock);
3609
list_del(&trans->list);
3610
seqmutex_unlock(&c->btree_trans_lock);
3611
3612
mempool_free(trans, &c->btree_trans_pool);
3613
}
3614
}
3615
3616
bool bch2_current_has_btree_trans(struct bch_fs *c)
3617
{
3618
seqmutex_lock(&c->btree_trans_lock);
3619
struct btree_trans *trans;
3620
bool ret = false;
3621
list_for_each_entry(trans, &c->btree_trans_list, list)
3622
if (trans->locking_wait.task == current &&
3623
trans->locked) {
3624
ret = true;
3625
break;
3626
}
3627
seqmutex_unlock(&c->btree_trans_lock);
3628
return ret;
3629
}
3630
3631
static void __maybe_unused
3632
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3633
struct btree_bkey_cached_common *b)
3634
{
3635
struct six_lock_count c = six_lock_counts(&b->lock);
3636
pid_t pid;
3637
3638
scoped_guard(rcu) {
3639
struct task_struct *owner = READ_ONCE(b->lock.owner);
3640
pid = owner ? owner->pid : 0;
3641
}
3642
3643
prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
3644
bch2_btree_id_to_text(out, b->btree_id);
3645
prt_printf(out, " l=%u:", b->level);
3646
bch2_bpos_to_text(out, btree_node_pos(b));
3647
3648
prt_printf(out, "\t locks %u:%u:%u held by pid %u",
3649
c.n[0], c.n[1], c.n[2], pid);
3650
}
3651
3652
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3653
{
3654
struct btree_bkey_cached_common *b;
3655
static char lock_types[] = { 'r', 'i', 'w' };
3656
struct task_struct *task = READ_ONCE(trans->locking_wait.task);
3657
unsigned l, idx;
3658
3659
/* before rcu_read_lock(): */
3660
bch2_printbuf_make_room(out, 4096);
3661
3662
if (!out->nr_tabstops) {
3663
printbuf_tabstop_push(out, 16);
3664
printbuf_tabstop_push(out, 32);
3665
}
3666
3667
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
3668
3669
/* trans->paths is rcu protected vs. freeing */
3670
guard(rcu)();
3671
out->atomic++;
3672
3673
struct btree_path *paths = rcu_dereference(trans->paths);
3674
if (!paths)
3675
goto out;
3676
3677
unsigned long *paths_allocated = trans_paths_allocated(paths);
3678
3679
trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
3680
struct btree_path *path = paths + idx;
3681
if (!path->nodes_locked)
3682
continue;
3683
3684
prt_printf(out, " path %u %c ",
3685
idx,
3686
path->cached ? 'c' : 'b');
3687
bch2_btree_id_to_text(out, path->btree_id);
3688
prt_printf(out, " l=%u:", path->level);
3689
bch2_bpos_to_text(out, path->pos);
3690
prt_newline(out);
3691
3692
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3693
if (btree_node_locked(path, l) &&
3694
!IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3695
prt_printf(out, " %c l=%u ",
3696
lock_types[btree_node_locked_type(path, l)], l);
3697
bch2_btree_bkey_cached_common_to_text(out, b);
3698
prt_newline(out);
3699
}
3700
}
3701
}
3702
3703
b = READ_ONCE(trans->locking);
3704
if (b) {
3705
prt_printf(out, " blocked for %lluus on\n",
3706
div_u64(local_clock() - trans->locking_wait.start_time, 1000));
3707
prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
3708
bch2_btree_bkey_cached_common_to_text(out, b);
3709
prt_newline(out);
3710
}
3711
out:
3712
--out->atomic;
3713
}
3714
3715
void bch2_fs_btree_iter_exit(struct bch_fs *c)
3716
{
3717
struct btree_transaction_stats *s;
3718
struct btree_trans *trans;
3719
int cpu;
3720
3721
if (c->btree_trans_bufs)
3722
for_each_possible_cpu(cpu) {
3723
struct btree_trans *trans =
3724
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
3725
3726
if (trans) {
3727
seqmutex_lock(&c->btree_trans_lock);
3728
list_del(&trans->list);
3729
seqmutex_unlock(&c->btree_trans_lock);
3730
}
3731
kfree(trans);
3732
}
3733
free_percpu(c->btree_trans_bufs);
3734
3735
trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
3736
if (trans)
3737
panic("%s leaked btree_trans\n", trans->fn);
3738
3739
for (s = c->btree_transaction_stats;
3740
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3741
s++) {
3742
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
3743
darray_exit(&s->trans_kmalloc_trace);
3744
#endif
3745
kfree(s->max_paths_text);
3746
bch2_time_stats_exit(&s->lock_hold_times);
3747
}
3748
3749
if (c->btree_trans_barrier_initialized) {
3750
synchronize_srcu_expedited(&c->btree_trans_barrier);
3751
cleanup_srcu_struct(&c->btree_trans_barrier);
3752
}
3753
mempool_exit(&c->btree_trans_mem_pool);
3754
mempool_exit(&c->btree_trans_pool);
3755
}
3756
3757
void bch2_fs_btree_iter_init_early(struct bch_fs *c)
3758
{
3759
struct btree_transaction_stats *s;
3760
3761
for (s = c->btree_transaction_stats;
3762
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3763
s++) {
3764
bch2_time_stats_init(&s->duration);
3765
bch2_time_stats_init(&s->lock_hold_times);
3766
mutex_init(&s->lock);
3767
}
3768
3769
INIT_LIST_HEAD(&c->btree_trans_list);
3770
seqmutex_init(&c->btree_trans_lock);
3771
}
3772
3773
int bch2_fs_btree_iter_init(struct bch_fs *c)
3774
{
3775
int ret;
3776
3777
c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
3778
if (!c->btree_trans_bufs)
3779
return -ENOMEM;
3780
3781
ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
3782
sizeof(struct btree_trans)) ?:
3783
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3784
BTREE_TRANS_MEM_MAX) ?:
3785
init_srcu_struct(&c->btree_trans_barrier);
3786
if (ret)
3787
return ret;
3788
3789
/*
3790
* static annotation (hackily done) for lock ordering of reclaim vs.
3791
* btree node locks:
3792
*/
3793
#ifdef CONFIG_LOCKDEP
3794
fs_reclaim_acquire(GFP_KERNEL);
3795
struct btree_trans *trans = bch2_trans_get(c);
3796
trans_set_locked(trans, false);
3797
bch2_trans_put(trans);
3798
fs_reclaim_release(GFP_KERNEL);
3799
#endif
3800
3801
c->btree_trans_barrier_initialized = true;
3802
return 0;
3803
3804
}
3805
3806