Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/bcachefs/btree_node_scan.c
26278 views
1
// SPDX-License-Identifier: GPL-2.0
2
3
#include "bcachefs.h"
4
#include "btree_cache.h"
5
#include "btree_io.h"
6
#include "btree_journal_iter.h"
7
#include "btree_node_scan.h"
8
#include "btree_update_interior.h"
9
#include "buckets.h"
10
#include "error.h"
11
#include "journal_io.h"
12
#include "recovery_passes.h"
13
14
#include <linux/kthread.h>
15
#include <linux/min_heap.h>
16
#include <linux/sched/sysctl.h>
17
#include <linux/sort.h>
18
19
/*
 * Argument bundle handed to one read_btree_nodes_worker() kthread.
 *
 * Allocated by read_btree_nodes(), one per scanned device; the worker frees
 * it with kfree() when it finishes.
 */
struct find_btree_nodes_worker {
	/* Closure owned by the spawner; the worker does closure_put() on exit */
	struct closure		*cl;
	/* Shared scan state (found nodes, lock, first error) */
	struct find_btree_nodes	*f;
	/* Device this worker scans */
	struct bch_dev		*ca;
};
24
25
static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
26
{
27
bch2_btree_id_level_to_text(out, n->btree_id, n->level);
28
prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
29
n->seq, n->journal_seq, n->cookie);
30
bch2_bpos_to_text(out, n->min_key);
31
prt_str(out, "-");
32
bch2_bpos_to_text(out, n->max_key);
33
34
if (n->range_updated)
35
prt_str(out, " range updated");
36
37
for (unsigned i = 0; i < n->nr_ptrs; i++) {
38
prt_char(out, ' ');
39
bch2_extent_ptr_to_text(out, c, n->ptrs + i);
40
}
41
}
42
43
/*
 * Print every node in @nodes, one per line, indented by two columns relative
 * to the current indentation of @out.
 */
static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);

	darray_for_each(nodes, node) {
		found_btree_node_to_text(out, c, node);
		prt_newline(out);
	}

	printbuf_indent_sub(out, 2);
}
52
53
static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
54
{
55
struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
56
57
set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
58
bp->k.p = f->max_key;
59
bp->v.seq = cpu_to_le64(f->cookie);
60
bp->v.sectors_written = 0;
61
bp->v.flags = 0;
62
bp->v.sectors_written = cpu_to_le16(f->sectors_written);
63
bp->v.min_key = f->min_key;
64
SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
65
memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
66
}
67
68
static inline u64 bkey_journal_seq(struct bkey_s_c k)
69
{
70
switch (k.k->type) {
71
case KEY_TYPE_inode_v3:
72
return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
73
default:
74
return 0;
75
}
76
}
77
78
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
79
{
80
const struct found_btree_node *l = _l;
81
const struct found_btree_node *r = _r;
82
83
return cmp_int(l->btree_id, r->btree_id) ?:
84
cmp_int(l->level, r->level) ?:
85
cmp_int(l->cookie, r->cookie);
86
}
87
88
/*
89
* Given two found btree nodes, if their sequence numbers are equal, take the
90
* one that's readable:
91
*/
92
static int found_btree_node_cmp_time(const struct found_btree_node *l,
93
const struct found_btree_node *r)
94
{
95
return cmp_int(l->seq, r->seq) ?:
96
cmp_int(l->journal_seq, r->journal_seq);
97
}
98
99
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
100
{
101
const struct found_btree_node *l = _l;
102
const struct found_btree_node *r = _r;
103
104
return cmp_int(l->btree_id, r->btree_id) ?:
105
-cmp_int(l->level, r->level) ?:
106
bpos_cmp(l->min_key, r->min_key) ?:
107
-found_btree_node_cmp_time(l, r);
108
}
109
110
/*
 * min_heap "less" callback: true when @l sorts strictly before @r according
 * to found_btree_node_cmp_pos(). @arg is unused.
 */
static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
{
	int order = found_btree_node_cmp_pos(l, r);

	return order < 0;
}
114
115
static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
116
{
117
struct found_btree_node *l = _l;
118
struct found_btree_node *r = _r;
119
120
swap(*l, *r);
121
}
122
123
static const struct min_heap_callbacks found_btree_node_heap_cbs = {
124
.less = found_btree_node_cmp_pos_less,
125
.swp = found_btree_node_swap,
126
};
127
128
/*
 * Probe sector @offset on device @ca for a btree node and, if a valid one is
 * found, append a description of it to @f->nodes.
 *
 * @b and @bio are scratch objects owned by the caller and reused for every
 * probe. The probe is done in two steps: a single block is read first so the
 * header can be inspected cheaply; only if the header looks plausible is the
 * full btree node read and validated with bch2_btree_node_read_done().
 *
 * Nothing is returned; allocation failure and unsupported-endianness errors
 * are reported through @f->ret (set under @f->lock).
 */
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct btree *b, struct bio *bio, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	struct btree_node *bn = b->data;

	/* Step 1: read just one block, enough to look at the node header */
	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, b->data, c->opts.block_size);

	u64 submit_time = local_clock();
	submit_bio_wait(bio);
	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);

	if (bio->bi_status) {
		bch_err_dev_ratelimited(ca,
				"IO error in try_read_btree_node() at %llu: %s",
				offset, bch2_blk_status_to_str(bio->bi_status));
		return;
	}

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		/* Cannot decrypt the header without the key */
		if (!c->chacha20_key_set)
			return;

		struct nonce nonce = btree_nonce(&bn->keys, 0);
		/* Header bytes from ->flags up to (not including) ->keys */
		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
	}

	/* Nodes of alloc btrees are not collected */
	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
		return;

	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
		return;

	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
		return;

	/* bucket_gen_get() is called under rcu_read_lock() */
	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= bucket_gen_get(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	/* Step 2: header looks plausible, read the whole node */
	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, b->data, c->opts.btree_node_size);

	submit_time = local_clock();
	submit_bio_wait(bio);
	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);

	/*
	 * Don't try to parse the buffer if the full read failed: its contents
	 * are not the node we meant to read.
	 */
	if (bio->bi_status) {
		bch_err_dev_ratelimited(ca,
				"IO error in try_read_btree_node() at %llu: %s",
				offset, bch2_blk_status_to_str(bio->bi_status));
		return;
	}

	found_btree_node_to_key(&b->key, &n);

	CLASS(printbuf, buf)();
	if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
		/* read_done will swap out b->data for another buffer */
		bn = b->data;
		/*
		 * Grab journal_seq here because we want the max journal_seq of
		 * any bset; read_done sorts down to a single set and picks the
		 * max journal_seq
		 */
		n.journal_seq		= le64_to_cpu(bn->keys.journal_seq);
		n.sectors_written	= b->written;

		mutex_lock(&f->lock);
		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
			bch_err(c, "try_read_btree_node() can't handle endian conversion");
			f->ret = -EINVAL;
		} else if (darray_push(&f->nodes, n)) {
			f->ret = -ENOMEM;
		}
		mutex_unlock(&f->lock);
	}
}
222
223
static int read_btree_nodes_worker(void *p)
224
{
225
struct find_btree_nodes_worker *w = p;
226
struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
227
struct bch_dev *ca = w->ca;
228
unsigned long last_print = jiffies;
229
struct btree *b = NULL;
230
struct bio *bio = NULL;
231
232
b = __bch2_btree_node_mem_alloc(c);
233
if (!b) {
234
bch_err(c, "read_btree_nodes_worker: error allocating buf");
235
w->f->ret = -ENOMEM;
236
goto err;
237
}
238
239
bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
240
if (!bio) {
241
bch_err(c, "read_btree_nodes_worker: error allocating bio");
242
w->f->ret = -ENOMEM;
243
goto err;
244
}
245
246
for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
247
for (unsigned bucket_offset = 0;
248
bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
249
bucket_offset += btree_sectors(c)) {
250
if (time_after(jiffies, last_print + HZ * 30)) {
251
u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
252
u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
253
254
bch_info(ca, "%s: %2u%% done", __func__,
255
(unsigned) div64_u64(cur_sector * 100, end_sector));
256
last_print = jiffies;
257
}
258
259
u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
260
261
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
262
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
263
continue;
264
265
try_read_btree_node(w->f, ca, b, bio, sector);
266
}
267
err:
268
if (b)
269
__btree_node_data_free(b);
270
kfree(b);
271
bio_put(bio);
272
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
273
closure_put(w->cl);
274
kfree(w);
275
return 0;
276
}
277
278
static int read_btree_nodes(struct find_btree_nodes *f)
279
{
280
struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
281
struct closure cl;
282
int ret = 0;
283
284
closure_init_stack(&cl);
285
286
for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
287
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
288
continue;
289
290
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
291
if (!w) {
292
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
293
ret = -ENOMEM;
294
goto err;
295
}
296
297
w->cl = &cl;
298
w->f = f;
299
w->ca = ca;
300
301
struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
302
ret = PTR_ERR_OR_ZERO(t);
303
if (ret) {
304
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
305
kfree(w);
306
bch_err_msg(c, ret, "starting kthread");
307
break;
308
}
309
310
closure_get(&cl);
311
enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
312
wake_up_process(t);
313
}
314
err:
315
while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
316
;
317
return f->ret ?: ret;
318
}
319
320
static bool nodes_overlap(const struct found_btree_node *l,
321
const struct found_btree_node *r)
322
{
323
return (l->btree_id == r->btree_id &&
324
l->level == r->level &&
325
bpos_gt(l->max_key, r->min_key));
326
}
327
328
static int handle_overwrites(struct bch_fs *c,
329
struct found_btree_node *l,
330
found_btree_nodes *nodes_heap)
331
{
332
struct found_btree_node *r;
333
334
while ((r = min_heap_peek(nodes_heap)) &&
335
nodes_overlap(l, r)) {
336
int cmp = found_btree_node_cmp_time(l, r);
337
338
if (cmp > 0) {
339
if (bpos_cmp(l->max_key, r->max_key) >= 0)
340
min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
341
else {
342
r->range_updated = true;
343
r->min_key = bpos_successor(l->max_key);
344
r->range_updated = true;
345
min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
346
}
347
} else if (cmp < 0) {
348
BUG_ON(bpos_eq(l->min_key, r->min_key));
349
350
l->max_key = bpos_predecessor(r->min_key);
351
l->range_updated = true;
352
} else if (r->level) {
353
min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
354
} else {
355
if (bpos_cmp(l->max_key, r->max_key) >= 0)
356
min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
357
else {
358
r->range_updated = true;
359
r->min_key = bpos_successor(l->max_key);
360
r->range_updated = true;
361
min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
362
}
363
}
364
365
cond_resched();
366
}
367
368
return 0;
369
}
370
371
int bch2_scan_for_btree_nodes(struct bch_fs *c)
372
{
373
struct find_btree_nodes *f = &c->found_btree_nodes;
374
struct printbuf buf = PRINTBUF;
375
found_btree_nodes nodes_heap = {};
376
size_t dst;
377
int ret = 0;
378
379
if (f->nodes.nr)
380
return 0;
381
382
mutex_init(&f->lock);
383
384
ret = read_btree_nodes(f);
385
if (ret)
386
return ret;
387
388
if (!f->nodes.nr) {
389
bch_err(c, "%s: no btree nodes found", __func__);
390
ret = -EINVAL;
391
goto err;
392
}
393
394
if (0 && c->opts.verbose) {
395
printbuf_reset(&buf);
396
prt_printf(&buf, "%s: nodes found:\n", __func__);
397
found_btree_nodes_to_text(&buf, c, f->nodes);
398
bch2_print_str(c, KERN_INFO, buf.buf);
399
}
400
401
sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
402
403
dst = 0;
404
darray_for_each(f->nodes, i) {
405
struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
406
407
if (prev &&
408
prev->cookie == i->cookie) {
409
if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
410
bch_err(c, "%s: found too many replicas for btree node", __func__);
411
ret = -EINVAL;
412
goto err;
413
}
414
prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
415
} else {
416
f->nodes.data[dst++] = *i;
417
}
418
}
419
f->nodes.nr = dst;
420
421
sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
422
423
if (0 && c->opts.verbose) {
424
printbuf_reset(&buf);
425
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
426
found_btree_nodes_to_text(&buf, c, f->nodes);
427
bch2_print_str(c, KERN_INFO, buf.buf);
428
}
429
430
swap(nodes_heap, f->nodes);
431
432
{
433
/* darray must have same layout as a heap */
434
min_heap_char real_heap;
435
BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
436
BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
437
BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
438
BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
439
}
440
441
min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
442
443
if (nodes_heap.nr) {
444
ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
445
if (ret)
446
goto err;
447
448
min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
449
}
450
451
while (true) {
452
ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
453
if (ret)
454
goto err;
455
456
if (!nodes_heap.nr)
457
break;
458
459
ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
460
if (ret)
461
goto err;
462
463
min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
464
}
465
466
for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
467
BUG_ON(nodes_overlap(n, n + 1));
468
469
if (0 && c->opts.verbose) {
470
printbuf_reset(&buf);
471
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
472
found_btree_nodes_to_text(&buf, c, f->nodes);
473
bch2_print_str(c, KERN_INFO, buf.buf);
474
} else {
475
bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
476
}
477
478
eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
479
err:
480
darray_exit(&nodes_heap);
481
printbuf_exit(&buf);
482
return ret;
483
}
484
485
static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
486
{
487
const struct found_btree_node *l = _l;
488
const struct found_btree_node *r = _r;
489
490
return cmp_int(l->btree_id, r->btree_id) ?:
491
-cmp_int(l->level, r->level) ?:
492
bpos_cmp(l->max_key, r->min_key);
493
}
494
495
/*
 * Iterate @_idx over the entries of (@_f)->nodes that have the same btree_id
 * and level as @_search and whose min_key is below @_search.max_key, starting
 * from the position eytzinger0_find_gt() returns for @_search.
 *
 * @_f:      pointer to a struct find_btree_nodes whose nodes are in
 *           eytzinger order
 * @_search: struct found_btree_node lvalue holding the btree_id, level and
 *           key range to look up
 * @_idx:    name of the size_t index variable declared by the loop
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
					sizeof((_f)->nodes.data[0]),			\
					found_btree_node_range_start_cmp, &(_search));	\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&			\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
504
505
bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
506
{
507
struct find_btree_nodes *f = &c->found_btree_nodes;
508
509
struct found_btree_node search = {
510
.btree_id = b->c.btree_id,
511
.level = b->c.level,
512
.min_key = b->data->min_key,
513
.max_key = b->key.k.p,
514
};
515
516
for_each_found_btree_node_in_range(f, search, idx)
517
if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
518
return true;
519
return false;
520
}
521
522
int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
523
{
524
int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
525
if (ret)
526
return ret;
527
528
struct found_btree_node search = {
529
.btree_id = btree,
530
.level = 0,
531
.min_key = POS_MIN,
532
.max_key = SPOS_MAX,
533
};
534
535
for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
536
return true;
537
return false;
538
}
539
540
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
541
unsigned level, struct bpos node_min, struct bpos node_max)
542
{
543
if (btree_id_is_alloc(btree))
544
return 0;
545
546
struct find_btree_nodes *f = &c->found_btree_nodes;
547
548
int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
549
if (ret)
550
return ret;
551
552
if (c->opts.verbose) {
553
struct printbuf buf = PRINTBUF;
554
555
prt_str(&buf, "recovery ");
556
bch2_btree_id_level_to_text(&buf, btree, level);
557
prt_str(&buf, " ");
558
bch2_bpos_to_text(&buf, node_min);
559
prt_str(&buf, " - ");
560
bch2_bpos_to_text(&buf, node_max);
561
562
bch_info(c, "%s(): %s", __func__, buf.buf);
563
printbuf_exit(&buf);
564
}
565
566
struct found_btree_node search = {
567
.btree_id = btree,
568
.level = level,
569
.min_key = node_min,
570
.max_key = node_max,
571
};
572
573
for_each_found_btree_node_in_range(f, search, idx) {
574
struct found_btree_node n = f->nodes.data[idx];
575
576
n.range_updated |= bpos_lt(n.min_key, node_min);
577
n.min_key = bpos_max(n.min_key, node_min);
578
579
n.range_updated |= bpos_gt(n.max_key, node_max);
580
n.max_key = bpos_min(n.max_key, node_max);
581
582
struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
583
584
found_btree_node_to_key(&tmp.k, &n);
585
586
if (c->opts.verbose) {
587
struct printbuf buf = PRINTBUF;
588
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
589
bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
590
printbuf_exit(&buf);
591
}
592
593
BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
594
(struct bkey_validate_context) {
595
.from = BKEY_VALIDATE_btree_node,
596
.level = level + 1,
597
.btree = btree,
598
}));
599
600
ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
601
if (ret)
602
return ret;
603
}
604
605
return 0;
606
}
607
608
void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
609
{
610
darray_exit(&f->nodes);
611
}
612
613