Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/bcachefs/btree_gc.c
26285 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2010 Kent Overstreet <[email protected]>
4
* Copyright (C) 2014 Datera Inc.
5
*/
6
7
#include "bcachefs.h"
8
#include "alloc_background.h"
9
#include "alloc_foreground.h"
10
#include "backpointers.h"
11
#include "bkey_methods.h"
12
#include "bkey_buf.h"
13
#include "btree_journal_iter.h"
14
#include "btree_key_cache.h"
15
#include "btree_locking.h"
16
#include "btree_node_scan.h"
17
#include "btree_update_interior.h"
18
#include "btree_io.h"
19
#include "btree_gc.h"
20
#include "buckets.h"
21
#include "clock.h"
22
#include "debug.h"
23
#include "disk_accounting.h"
24
#include "ec.h"
25
#include "enumerated_ref.h"
26
#include "error.h"
27
#include "extents.h"
28
#include "journal.h"
29
#include "keylist.h"
30
#include "move.h"
31
#include "progress.h"
32
#include "recovery_passes.h"
33
#include "reflink.h"
34
#include "recovery.h"
35
#include "replicas.h"
36
#include "super-io.h"
37
#include "trace.h"
38
39
#include <linux/slab.h>
40
#include <linux/bitops.h>
41
#include <linux/freezer.h>
42
#include <linux/kthread.h>
43
#include <linux/preempt.h>
44
#include <linux/rcupdate.h>
45
#include <linux/sched/task.h>
46
47
#define DROP_THIS_NODE 10
48
#define DROP_PREV_NODE 11
49
#define DID_FILL_FROM_SCAN 12
50
51
/*
52
* Returns true if it's a btree we can easily reconstruct, or otherwise won't
53
* cause data loss if it's missing:
54
*/
55
static bool btree_id_important(enum btree_id btree)
56
{
57
if (btree_id_is_alloc(btree))
58
return false;
59
60
switch (btree) {
61
case BTREE_ID_quotas:
62
case BTREE_ID_snapshot_trees:
63
case BTREE_ID_logged_ops:
64
case BTREE_ID_rebalance_work:
65
case BTREE_ID_subvolume_children:
66
return false;
67
default:
68
return true;
69
}
70
}
71
72
static const char * const bch2_gc_phase_strs[] = {
73
#define x(n) #n,
74
GC_PHASES()
75
#undef x
76
NULL
77
};
78
79
void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
80
{
81
prt_str(out, bch2_gc_phase_strs[p->phase]);
82
prt_char(out, ' ');
83
bch2_btree_id_level_to_text(out, p->btree, p->level);
84
prt_char(out, ' ');
85
bch2_bpos_to_text(out, p->pos);
86
}
87
88
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
89
{
90
return (struct bkey_s) {{{
91
(struct bkey *) k.k,
92
(struct bch_val *) k.v
93
}}};
94
}
95
96
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
97
{
98
preempt_disable();
99
write_seqcount_begin(&c->gc_pos_lock);
100
c->gc_pos = new_pos;
101
write_seqcount_end(&c->gc_pos_lock);
102
preempt_enable();
103
}
104
105
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
106
{
107
BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
108
__gc_pos_set(c, new_pos);
109
}
110
111
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
112
{
113
switch (b->key.k.type) {
114
case KEY_TYPE_btree_ptr: {
115
struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
116
117
dst->k.p = src->k.p;
118
dst->v.mem_ptr = 0;
119
dst->v.seq = b->data->keys.seq;
120
dst->v.sectors_written = 0;
121
dst->v.flags = 0;
122
dst->v.min_key = b->data->min_key;
123
set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
124
memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
125
break;
126
}
127
case KEY_TYPE_btree_ptr_v2:
128
bkey_copy(&dst->k_i, &b->key);
129
break;
130
default:
131
BUG();
132
}
133
}
134
135
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
136
{
137
struct bkey_i_btree_ptr_v2 *new;
138
int ret;
139
140
if (c->opts.verbose) {
141
struct printbuf buf = PRINTBUF;
142
143
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
144
prt_str(&buf, " -> ");
145
bch2_bpos_to_text(&buf, new_min);
146
147
bch_info(c, "%s(): %s", __func__, buf.buf);
148
printbuf_exit(&buf);
149
}
150
151
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
152
if (!new)
153
return bch_err_throw(c, ENOMEM_gc_repair_key);
154
155
btree_ptr_to_v2(b, new);
156
b->data->min_key = new_min;
157
new->v.min_key = new_min;
158
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
159
160
ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
161
if (ret) {
162
kfree(new);
163
return ret;
164
}
165
166
bch2_btree_node_drop_keys_outside_node(b);
167
bkey_copy(&b->key, &new->k_i);
168
return 0;
169
}
170
171
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
172
{
173
struct bkey_i_btree_ptr_v2 *new;
174
int ret;
175
176
if (c->opts.verbose) {
177
struct printbuf buf = PRINTBUF;
178
179
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
180
prt_str(&buf, " -> ");
181
bch2_bpos_to_text(&buf, new_max);
182
183
bch_info(c, "%s(): %s", __func__, buf.buf);
184
printbuf_exit(&buf);
185
}
186
187
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
188
if (ret)
189
return ret;
190
191
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
192
if (!new)
193
return bch_err_throw(c, ENOMEM_gc_repair_key);
194
195
btree_ptr_to_v2(b, new);
196
b->data->max_key = new_max;
197
new->k.p = new_max;
198
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
199
200
ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
201
if (ret) {
202
kfree(new);
203
return ret;
204
}
205
206
bch2_btree_node_drop_keys_outside_node(b);
207
208
mutex_lock(&c->btree_cache.lock);
209
__bch2_btree_node_hash_remove(&c->btree_cache, b);
210
211
bkey_copy(&b->key, &new->k_i);
212
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
213
BUG_ON(ret);
214
mutex_unlock(&c->btree_cache.lock);
215
return 0;
216
}
217
218
static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
219
struct btree *prev, struct btree *cur,
220
struct bpos *pulled_from_scan)
221
{
222
struct bch_fs *c = trans->c;
223
struct bpos expected_start = !prev
224
? b->data->min_key
225
: bpos_successor(prev->key.k.p);
226
struct printbuf buf = PRINTBUF;
227
int ret = 0;
228
229
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
230
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
231
b->data->min_key));
232
233
if (bpos_eq(expected_start, cur->data->min_key))
234
return 0;
235
236
prt_printf(&buf, " at ");
237
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
238
prt_printf(&buf, ":\nparent: ");
239
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
240
241
if (prev) {
242
prt_printf(&buf, "\nprev: ");
243
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
244
}
245
246
prt_str(&buf, "\nnext: ");
247
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
248
249
if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
250
if (b->c.level == 1 &&
251
bpos_lt(*pulled_from_scan, cur->data->min_key)) {
252
ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
253
expected_start,
254
bpos_predecessor(cur->data->min_key));
255
if (ret)
256
goto err;
257
258
*pulled_from_scan = cur->data->min_key;
259
ret = DID_FILL_FROM_SCAN;
260
} else {
261
if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
262
"btree node with incorrect min_key%s", buf.buf))
263
ret = set_node_min(c, cur, expected_start);
264
}
265
} else { /* overlap */
266
if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
267
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
268
if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
269
"btree node overwritten by next node%s", buf.buf))
270
ret = DROP_PREV_NODE;
271
} else {
272
if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
273
"btree node with incorrect max_key%s", buf.buf))
274
ret = set_node_max(c, prev,
275
bpos_predecessor(cur->data->min_key));
276
}
277
} else {
278
if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
279
if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
280
"btree node overwritten by prev node%s", buf.buf))
281
ret = DROP_THIS_NODE;
282
} else {
283
if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
284
"btree node with incorrect min_key%s", buf.buf))
285
ret = set_node_min(c, cur, expected_start);
286
}
287
}
288
}
289
err:
290
fsck_err:
291
printbuf_exit(&buf);
292
return ret;
293
}
294
295
static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
296
struct btree *child, struct bpos *pulled_from_scan)
297
{
298
struct bch_fs *c = trans->c;
299
struct printbuf buf = PRINTBUF;
300
int ret = 0;
301
302
if (bpos_eq(child->key.k.p, b->key.k.p))
303
return 0;
304
305
prt_printf(&buf, "\nat: ");
306
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
307
prt_printf(&buf, "\nparent: ");
308
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
309
310
prt_str(&buf, "\nchild: ");
311
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
312
313
if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
314
"btree node with incorrect max_key%s", buf.buf)) {
315
if (b->c.level == 1 &&
316
bpos_lt(*pulled_from_scan, b->key.k.p)) {
317
ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
318
bpos_successor(child->key.k.p), b->key.k.p);
319
if (ret)
320
goto err;
321
322
*pulled_from_scan = b->key.k.p;
323
ret = DID_FILL_FROM_SCAN;
324
} else {
325
ret = set_node_max(c, child, b->key.k.p);
326
}
327
}
328
err:
329
fsck_err:
330
printbuf_exit(&buf);
331
return ret;
332
}
333
334
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
335
struct bpos *pulled_from_scan)
336
{
337
struct bch_fs *c = trans->c;
338
struct btree_and_journal_iter iter;
339
struct bkey_s_c k;
340
struct bkey_buf prev_k, cur_k;
341
struct btree *prev = NULL, *cur = NULL;
342
bool have_child, new_pass = false;
343
struct printbuf buf = PRINTBUF;
344
int ret = 0;
345
346
if (!b->c.level)
347
return 0;
348
349
bch2_bkey_buf_init(&prev_k);
350
bch2_bkey_buf_init(&cur_k);
351
again:
352
cur = prev = NULL;
353
have_child = new_pass = false;
354
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
355
iter.prefetch = true;
356
357
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
358
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
359
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
360
361
bch2_btree_and_journal_iter_advance(&iter);
362
bch2_bkey_buf_reassemble(&cur_k, c, k);
363
364
cur = bch2_btree_node_get_noiter(trans, cur_k.k,
365
b->c.btree_id, b->c.level - 1,
366
false);
367
ret = PTR_ERR_OR_ZERO(cur);
368
369
printbuf_reset(&buf);
370
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
371
prt_char(&buf, ' ');
372
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
373
374
if (bch2_err_matches(ret, EIO)) {
375
bch2_btree_node_evict(trans, cur_k.k);
376
cur = NULL;
377
ret = bch2_journal_key_delete(c, b->c.btree_id,
378
b->c.level, cur_k.k->k.p);
379
if (ret)
380
break;
381
continue;
382
}
383
384
bch_err_msg(c, ret, "getting btree node");
385
if (ret)
386
break;
387
388
if (bch2_btree_node_is_stale(c, cur)) {
389
bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
390
six_unlock_read(&cur->c.lock);
391
bch2_btree_node_evict(trans, cur_k.k);
392
ret = bch2_journal_key_delete(c, b->c.btree_id,
393
b->c.level, cur_k.k->k.p);
394
cur = NULL;
395
if (ret)
396
break;
397
continue;
398
}
399
400
ret = lockrestart_do(trans,
401
btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
402
if (ret < 0)
403
goto err;
404
405
if (ret == DID_FILL_FROM_SCAN) {
406
new_pass = true;
407
ret = 0;
408
}
409
410
if (ret == DROP_THIS_NODE) {
411
six_unlock_read(&cur->c.lock);
412
bch2_btree_node_evict(trans, cur_k.k);
413
ret = bch2_journal_key_delete(c, b->c.btree_id,
414
b->c.level, cur_k.k->k.p);
415
cur = NULL;
416
if (ret)
417
break;
418
continue;
419
}
420
421
if (prev)
422
six_unlock_read(&prev->c.lock);
423
prev = NULL;
424
425
if (ret == DROP_PREV_NODE) {
426
bch_info(c, "dropped prev node");
427
bch2_btree_node_evict(trans, prev_k.k);
428
ret = bch2_journal_key_delete(c, b->c.btree_id,
429
b->c.level, prev_k.k->k.p);
430
if (ret)
431
break;
432
433
bch2_btree_and_journal_iter_exit(&iter);
434
goto again;
435
} else if (ret)
436
break;
437
438
prev = cur;
439
cur = NULL;
440
bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
441
}
442
443
if (!ret && !IS_ERR_OR_NULL(prev)) {
444
BUG_ON(cur);
445
ret = lockrestart_do(trans,
446
btree_repair_node_end(trans, b, prev, pulled_from_scan));
447
if (ret == DID_FILL_FROM_SCAN) {
448
new_pass = true;
449
ret = 0;
450
}
451
}
452
453
if (!IS_ERR_OR_NULL(prev))
454
six_unlock_read(&prev->c.lock);
455
prev = NULL;
456
if (!IS_ERR_OR_NULL(cur))
457
six_unlock_read(&cur->c.lock);
458
cur = NULL;
459
460
if (ret)
461
goto err;
462
463
bch2_btree_and_journal_iter_exit(&iter);
464
465
if (new_pass)
466
goto again;
467
468
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
469
iter.prefetch = true;
470
471
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
472
bch2_bkey_buf_reassemble(&cur_k, c, k);
473
bch2_btree_and_journal_iter_advance(&iter);
474
475
cur = bch2_btree_node_get_noiter(trans, cur_k.k,
476
b->c.btree_id, b->c.level - 1,
477
false);
478
ret = PTR_ERR_OR_ZERO(cur);
479
480
bch_err_msg(c, ret, "getting btree node");
481
if (ret)
482
goto err;
483
484
ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
485
six_unlock_read(&cur->c.lock);
486
cur = NULL;
487
488
if (ret == DROP_THIS_NODE) {
489
bch2_btree_node_evict(trans, cur_k.k);
490
ret = bch2_journal_key_delete(c, b->c.btree_id,
491
b->c.level, cur_k.k->k.p);
492
new_pass = true;
493
}
494
495
if (ret)
496
goto err;
497
498
have_child = true;
499
}
500
501
printbuf_reset(&buf);
502
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
503
prt_newline(&buf);
504
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
505
506
/*
507
* XXX: we're not passing the trans object here because we're not set up
508
* to handle a transaction restart - this code needs to be rewritten
509
* when we start doing online topology repair
510
*/
511
bch2_trans_unlock_long(trans);
512
if (mustfix_fsck_err_on(!have_child,
513
c, btree_node_topology_interior_node_empty,
514
"empty interior btree node at %s", buf.buf))
515
ret = DROP_THIS_NODE;
516
err:
517
fsck_err:
518
if (!IS_ERR_OR_NULL(prev))
519
six_unlock_read(&prev->c.lock);
520
if (!IS_ERR_OR_NULL(cur))
521
six_unlock_read(&cur->c.lock);
522
523
bch2_btree_and_journal_iter_exit(&iter);
524
525
if (!ret && new_pass)
526
goto again;
527
528
BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
529
530
bch2_bkey_buf_exit(&prev_k, c);
531
bch2_bkey_buf_exit(&cur_k, c);
532
printbuf_exit(&buf);
533
bch_err_fn(c, ret);
534
return ret;
535
}
536
537
static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
538
bool *reconstructed_root)
539
{
540
struct bch_fs *c = trans->c;
541
struct btree_root *r = bch2_btree_id_root(c, btree);
542
struct printbuf buf = PRINTBUF;
543
int ret = 0;
544
545
bch2_btree_id_to_text(&buf, btree);
546
547
if (r->error) {
548
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
549
550
ret = bch2_btree_has_scanned_nodes(c, btree);
551
if (ret < 0)
552
goto err;
553
554
if (!ret) {
555
__fsck_err(trans,
556
FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
557
btree_root_unreadable_and_scan_found_nothing,
558
"no nodes found for btree %s, continue?", buf.buf);
559
560
r->alive = false;
561
r->error = 0;
562
bch2_btree_root_alloc_fake_trans(trans, btree, 0);
563
} else {
564
r->alive = false;
565
r->error = 0;
566
bch2_btree_root_alloc_fake_trans(trans, btree, 1);
567
568
bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
569
ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
570
if (ret)
571
goto err;
572
}
573
574
*reconstructed_root = true;
575
}
576
err:
577
fsck_err:
578
printbuf_exit(&buf);
579
bch_err_fn(c, ret);
580
return ret;
581
}
582
583
int bch2_check_topology(struct bch_fs *c)
584
{
585
struct btree_trans *trans = bch2_trans_get(c);
586
struct bpos pulled_from_scan = POS_MIN;
587
int ret = 0;
588
589
bch2_trans_srcu_unlock(trans);
590
591
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
592
bool reconstructed_root = false;
593
recover:
594
ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
595
if (ret)
596
break;
597
598
struct btree_root *r = bch2_btree_id_root(c, i);
599
struct btree *b = r->b;
600
601
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
602
ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
603
six_unlock_read(&b->c.lock);
604
605
if (ret == DROP_THIS_NODE) {
606
mutex_lock(&c->btree_cache.lock);
607
bch2_btree_node_hash_remove(&c->btree_cache, b);
608
mutex_unlock(&c->btree_cache.lock);
609
610
r->b = NULL;
611
612
if (!reconstructed_root) {
613
r->error = -EIO;
614
goto recover;
615
}
616
617
struct printbuf buf = PRINTBUF;
618
bch2_btree_id_to_text(&buf, i);
619
bch_err(c, "empty btree root %s", buf.buf);
620
printbuf_exit(&buf);
621
bch2_btree_root_alloc_fake_trans(trans, i, 0);
622
r->alive = false;
623
ret = 0;
624
}
625
}
626
627
bch2_trans_put(trans);
628
return ret;
629
}
630
631
/* marking of btree keys/nodes: */
632
633
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
634
unsigned level, struct btree **prev,
635
struct btree_iter *iter, struct bkey_s_c k,
636
bool initial)
637
{
638
struct bch_fs *c = trans->c;
639
640
if (iter) {
641
struct btree_path *path = btree_iter_path(trans, iter);
642
struct btree *b = path_l(path)->b;
643
644
if (*prev != b) {
645
int ret = bch2_btree_node_check_topology(trans, b);
646
if (ret)
647
return ret;
648
}
649
*prev = b;
650
}
651
652
struct bkey deleted = KEY(0, 0, 0);
653
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
654
struct printbuf buf = PRINTBUF;
655
int ret = 0;
656
657
deleted.p = k.k->p;
658
659
if (initial) {
660
BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
661
k.k->bversion.lo > atomic64_read(&c->journal.seq));
662
663
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
664
k.k->bversion.lo > atomic64_read(&c->key_version),
665
trans, bkey_version_in_future,
666
"key version number higher than recorded %llu\n%s",
667
atomic64_read(&c->key_version),
668
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
669
atomic64_set(&c->key_version, k.k->bversion.lo);
670
}
671
672
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
673
trans, btree_bitmap_not_marked,
674
"btree ptr not marked in member info btree allocated bitmap\n%s",
675
(printbuf_reset(&buf),
676
bch2_bkey_val_to_text(&buf, c, k),
677
buf.buf))) {
678
mutex_lock(&c->sb_lock);
679
bch2_dev_btree_bitmap_mark(c, k);
680
bch2_write_super(c);
681
mutex_unlock(&c->sb_lock);
682
}
683
684
/*
685
* We require a commit before key_trigger() because
686
* key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the
687
* wrong result if we run it multiple times.
688
*/
689
unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
690
691
ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
692
BTREE_TRIGGER_check_repair|flags);
693
if (ret)
694
goto out;
695
696
if (trans->nr_updates) {
697
ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
698
-BCH_ERR_transaction_restart_nested;
699
goto out;
700
}
701
702
ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
703
BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
704
out:
705
fsck_err:
706
printbuf_exit(&buf);
707
bch_err_fn(c, ret);
708
return ret;
709
}
710
711
static int bch2_gc_btree(struct btree_trans *trans,
712
struct progress_indicator_state *progress,
713
enum btree_id btree, bool initial)
714
{
715
struct bch_fs *c = trans->c;
716
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
717
int ret = 0;
718
719
/* We need to make sure every leaf node is readable before going RW */
720
if (initial)
721
target_depth = 0;
722
723
for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
724
struct btree *prev = NULL;
725
struct btree_iter iter;
726
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
727
BTREE_ITER_prefetch);
728
729
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
730
bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
731
gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
732
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
733
}));
734
if (ret)
735
goto err;
736
}
737
738
/* root */
739
do {
740
retry_root:
741
bch2_trans_begin(trans);
742
743
struct btree_iter iter;
744
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
745
0, bch2_btree_id_root(c, btree)->b->c.level, 0);
746
struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
747
ret = PTR_ERR_OR_ZERO(b);
748
if (ret)
749
goto err_root;
750
751
if (b != btree_node_root(c, b)) {
752
bch2_trans_iter_exit(trans, &iter);
753
goto retry_root;
754
}
755
756
gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
757
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
758
ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
759
err_root:
760
bch2_trans_iter_exit(trans, &iter);
761
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
762
err:
763
bch_err_fn(c, ret);
764
return ret;
765
}
766
767
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
768
{
769
return cmp_int(gc_btree_order(l), gc_btree_order(r));
770
}
771
772
static int bch2_gc_btrees(struct bch_fs *c)
773
{
774
struct btree_trans *trans = bch2_trans_get(c);
775
struct printbuf buf = PRINTBUF;
776
int ret = 0;
777
778
struct progress_indicator_state progress;
779
bch2_progress_init(&progress, c, ~0ULL);
780
781
enum btree_id ids[BTREE_ID_NR];
782
for (unsigned i = 0; i < BTREE_ID_NR; i++)
783
ids[i] = i;
784
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
785
786
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
787
unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
788
789
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
790
continue;
791
792
ret = bch2_gc_btree(trans, &progress, btree, true);
793
}
794
795
printbuf_exit(&buf);
796
bch2_trans_put(trans);
797
bch_err_fn(c, ret);
798
return ret;
799
}
800
801
static int bch2_mark_superblocks(struct bch_fs *c)
802
{
803
gc_pos_set(c, gc_phase(GC_PHASE_sb));
804
805
return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
806
}
807
808
static void bch2_gc_free(struct bch_fs *c)
809
{
810
bch2_accounting_gc_free(c);
811
812
genradix_free(&c->reflink_gc_table);
813
genradix_free(&c->gc_stripes);
814
815
for_each_member_device(c, ca)
816
genradix_free(&ca->buckets_gc);
817
}
818
819
static int bch2_gc_start(struct bch_fs *c)
820
{
821
for_each_member_device(c, ca) {
822
int ret = bch2_dev_usage_init(ca, true);
823
if (ret) {
824
bch2_dev_put(ca);
825
return ret;
826
}
827
}
828
829
return 0;
830
}
831
832
/* returns true if not equal */
833
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
834
struct bch_alloc_v4 r)
835
{
836
return l.gen != r.gen ||
837
l.oldest_gen != r.oldest_gen ||
838
l.data_type != r.data_type ||
839
l.dirty_sectors != r.dirty_sectors ||
840
l.stripe_sectors != r.stripe_sectors ||
841
l.cached_sectors != r.cached_sectors ||
842
l.stripe_redundancy != r.stripe_redundancy ||
843
l.stripe != r.stripe;
844
}
845
846
static int bch2_alloc_write_key(struct btree_trans *trans,
847
struct btree_iter *iter,
848
struct bch_dev *ca,
849
struct bkey_s_c k)
850
{
851
struct bch_fs *c = trans->c;
852
struct bkey_i_alloc_v4 *a;
853
struct bch_alloc_v4 old_gc, gc, old_convert, new;
854
const struct bch_alloc_v4 *old;
855
int ret;
856
857
if (!bucket_valid(ca, k.k->p.offset))
858
return 0;
859
860
old = bch2_alloc_to_v4(k, &old_convert);
861
gc = new = *old;
862
863
__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
864
865
old_gc = gc;
866
867
if ((old->data_type == BCH_DATA_sb ||
868
old->data_type == BCH_DATA_journal) &&
869
!bch2_dev_is_online(ca)) {
870
gc.data_type = old->data_type;
871
gc.dirty_sectors = old->dirty_sectors;
872
}
873
874
/*
875
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
876
* fix that here:
877
*/
878
alloc_data_type_set(&gc, gc.data_type);
879
if (gc.data_type != old_gc.data_type ||
880
gc.dirty_sectors != old_gc.dirty_sectors) {
881
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
882
if (ret)
883
return ret;
884
885
/*
886
* Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
887
* safe w.r.t. transaction restarts, so fixup the gc_bucket so
888
* we don't run it twice:
889
*/
890
struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
891
gc_m->data_type = gc.data_type;
892
gc_m->dirty_sectors = gc.dirty_sectors;
893
}
894
895
if (fsck_err_on(new.data_type != gc.data_type,
896
trans, alloc_key_data_type_wrong,
897
"bucket %llu:%llu gen %u has wrong data_type"
898
": got %s, should be %s",
899
iter->pos.inode, iter->pos.offset,
900
gc.gen,
901
bch2_data_type_str(new.data_type),
902
bch2_data_type_str(gc.data_type)))
903
new.data_type = gc.data_type;
904
905
#define copy_bucket_field(_errtype, _f) \
906
if (fsck_err_on(new._f != gc._f, \
907
trans, _errtype, \
908
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
909
": got %llu, should be %llu", \
910
iter->pos.inode, iter->pos.offset, \
911
gc.gen, \
912
bch2_data_type_str(gc.data_type), \
913
(u64) new._f, (u64) gc._f)) \
914
new._f = gc._f; \
915
916
copy_bucket_field(alloc_key_gen_wrong, gen);
917
copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
918
copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
919
copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
920
copy_bucket_field(alloc_key_stripe_wrong, stripe);
921
copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
922
#undef copy_bucket_field
923
924
if (!bch2_alloc_v4_cmp(*old, new))
925
return 0;
926
927
a = bch2_alloc_to_v4_mut(trans, k);
928
ret = PTR_ERR_OR_ZERO(a);
929
if (ret)
930
return ret;
931
932
a->v = new;
933
934
/*
935
* The trigger normally makes sure these are set, but we're not running
936
* triggers:
937
*/
938
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
939
a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
940
941
ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
942
fsck_err:
943
return ret;
944
}
945
946
static int bch2_gc_alloc_done(struct bch_fs *c)
947
{
948
int ret = 0;
949
950
for_each_member_device(c, ca) {
951
ret = bch2_trans_run(c,
952
for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
953
POS(ca->dev_idx, ca->mi.first_bucket),
954
POS(ca->dev_idx, ca->mi.nbuckets - 1),
955
BTREE_ITER_slots|BTREE_ITER_prefetch, k,
956
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
957
bch2_alloc_write_key(trans, &iter, ca, k)));
958
if (ret) {
959
bch2_dev_put(ca);
960
break;
961
}
962
}
963
964
bch_err_fn(c, ret);
965
return ret;
966
}
967
968
static int bch2_gc_alloc_start(struct bch_fs *c)
969
{
970
int ret = 0;
971
972
for_each_member_device(c, ca) {
973
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
974
if (ret) {
975
bch2_dev_put(ca);
976
ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
977
break;
978
}
979
}
980
981
bch_err_fn(c, ret);
982
return ret;
983
}
984
985
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
986
struct btree_iter *iter,
987
struct bkey_s_c k)
988
{
989
struct bch_fs *c = trans->c;
990
struct printbuf buf = PRINTBUF;
991
const struct bch_stripe *s;
992
struct gc_stripe *m;
993
bool bad = false;
994
unsigned i;
995
int ret = 0;
996
997
if (k.k->type != KEY_TYPE_stripe)
998
return 0;
999
1000
s = bkey_s_c_to_stripe(k).v;
1001
m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
1002
1003
for (i = 0; i < s->nr_blocks; i++) {
1004
u32 old = stripe_blockcount_get(s, i);
1005
u32 new = (m ? m->block_sectors[i] : 0);
1006
1007
if (old != new) {
1008
prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
1009
i, old, new);
1010
bad = true;
1011
}
1012
}
1013
1014
if (bad)
1015
bch2_bkey_val_to_text(&buf, c, k);
1016
1017
if (fsck_err_on(bad,
1018
trans, stripe_sector_count_wrong,
1019
"%s", buf.buf)) {
1020
struct bkey_i_stripe *new;
1021
1022
new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1023
ret = PTR_ERR_OR_ZERO(new);
1024
if (ret)
1025
return ret;
1026
1027
bkey_reassemble(&new->k_i, k);
1028
1029
for (i = 0; i < new->v.nr_blocks; i++)
1030
stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
1031
1032
ret = bch2_trans_update(trans, iter, &new->k_i, 0);
1033
}
1034
fsck_err:
1035
printbuf_exit(&buf);
1036
return ret;
1037
}
1038
1039
static int bch2_gc_stripes_done(struct bch_fs *c)
1040
{
1041
return bch2_trans_run(c,
1042
for_each_btree_key_commit(trans, iter,
1043
BTREE_ID_stripes, POS_MIN,
1044
BTREE_ITER_prefetch, k,
1045
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1046
bch2_gc_write_stripes_key(trans, &iter, k)));
1047
}
1048
1049
/**
1050
* bch2_check_allocations - walk all references to buckets, and recompute them:
1051
*
1052
* @c: filesystem object
1053
*
1054
* Returns: 0 on success, or standard errcode on failure
1055
*
1056
* Order matters here:
1057
* - Concurrent GC relies on the fact that we have a total ordering for
1058
* everything that GC walks - see gc_will_visit_node(),
1059
* gc_will_visit_root()
1060
*
1061
* - also, references move around in the course of index updates and
1062
* various other crap: everything needs to agree on the ordering
1063
* references are allowed to move around in - e.g., we're allowed to
1064
* start with a reference owned by an open_bucket (the allocator) and
1065
* move it to the btree, but not the reverse.
1066
*
1067
* This is necessary to ensure that gc doesn't miss references that
1068
* move around - if references move backwards in the ordering GC
1069
* uses, GC could skip past them
1070
*/
1071
int bch2_check_allocations(struct bch_fs *c)
1072
{
1073
int ret;
1074
1075
down_read(&c->state_lock);
1076
down_write(&c->gc_lock);
1077
1078
bch2_btree_interior_updates_flush(c);
1079
1080
ret = bch2_gc_accounting_start(c) ?:
1081
bch2_gc_start(c) ?:
1082
bch2_gc_alloc_start(c) ?:
1083
bch2_gc_reflink_start(c);
1084
if (ret)
1085
goto out;
1086
1087
gc_pos_set(c, gc_phase(GC_PHASE_start));
1088
1089
ret = bch2_mark_superblocks(c);
1090
bch_err_msg(c, ret, "marking superblocks");
1091
if (ret)
1092
goto out;
1093
1094
ret = bch2_gc_btrees(c);
1095
if (ret)
1096
goto out;
1097
1098
c->gc_count++;
1099
1100
ret = bch2_gc_alloc_done(c) ?:
1101
bch2_gc_accounting_done(c) ?:
1102
bch2_gc_stripes_done(c) ?:
1103
bch2_gc_reflink_done(c);
1104
out:
1105
percpu_down_write(&c->mark_lock);
1106
/* Indicates that gc is no longer in progress: */
1107
__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
1108
1109
bch2_gc_free(c);
1110
percpu_up_write(&c->mark_lock);
1111
1112
up_write(&c->gc_lock);
1113
up_read(&c->state_lock);
1114
1115
/*
1116
* At startup, allocations can happen directly instead of via the
1117
* allocator thread - issue wakeup in case they blocked on gc_lock:
1118
*/
1119
closure_wake_up(&c->freelist_wait);
1120
1121
if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
1122
bch2_sb_members_clean_deleted(c);
1123
1124
bch_err_fn(c, ret);
1125
return ret;
1126
}
1127
1128
static int gc_btree_gens_key(struct btree_trans *trans,
1129
struct btree_iter *iter,
1130
struct bkey_s_c k)
1131
{
1132
struct bch_fs *c = trans->c;
1133
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1134
1135
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
1136
return -EROFS;
1137
1138
bool too_stale = false;
1139
scoped_guard(rcu) {
1140
bkey_for_each_ptr(ptrs, ptr) {
1141
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1142
if (!ca)
1143
continue;
1144
1145
too_stale |= dev_ptr_stale(ca, ptr) > 16;
1146
}
1147
1148
if (!too_stale)
1149
bkey_for_each_ptr(ptrs, ptr) {
1150
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1151
if (!ca)
1152
continue;
1153
1154
u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
1155
if (gen_after(*gen, ptr->gen))
1156
*gen = ptr->gen;
1157
}
1158
}
1159
1160
if (too_stale) {
1161
struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
1162
int ret = PTR_ERR_OR_ZERO(u);
1163
if (ret)
1164
return ret;
1165
1166
bch2_extent_normalize(c, bkey_i_to_s(u));
1167
}
1168
1169
return 0;
1170
}
1171
1172
static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
1173
struct btree_iter *iter, struct bkey_s_c k)
1174
{
1175
struct bch_alloc_v4 a_convert;
1176
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1177
struct bkey_i_alloc_v4 *a_mut;
1178
int ret;
1179
1180
if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
1181
return 0;
1182
1183
a_mut = bch2_alloc_to_v4_mut(trans, k);
1184
ret = PTR_ERR_OR_ZERO(a_mut);
1185
if (ret)
1186
return ret;
1187
1188
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
1189
1190
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
1191
}
1192
1193
int bch2_gc_gens(struct bch_fs *c)
1194
{
1195
u64 b, start_time = local_clock();
1196
int ret;
1197
1198
if (!mutex_trylock(&c->gc_gens_lock))
1199
return 0;
1200
1201
trace_and_count(c, gc_gens_start, c);
1202
1203
/*
1204
* We have to use trylock here. Otherwise, we would
1205
* introduce a deadlock in the RO path - we take the
1206
* state lock at the start of going RO.
1207
*/
1208
if (!down_read_trylock(&c->state_lock)) {
1209
mutex_unlock(&c->gc_gens_lock);
1210
return 0;
1211
}
1212
1213
for_each_member_device(c, ca) {
1214
struct bucket_gens *gens = bucket_gens(ca);
1215
1216
BUG_ON(ca->oldest_gen);
1217
1218
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
1219
if (!ca->oldest_gen) {
1220
bch2_dev_put(ca);
1221
ret = bch_err_throw(c, ENOMEM_gc_gens);
1222
goto err;
1223
}
1224
1225
for (b = gens->first_bucket;
1226
b < gens->nbuckets; b++)
1227
ca->oldest_gen[b] = gens->b[b];
1228
}
1229
1230
for (unsigned i = 0; i < BTREE_ID_NR; i++)
1231
if (btree_type_has_ptrs(i)) {
1232
c->gc_gens_btree = i;
1233
c->gc_gens_pos = POS_MIN;
1234
1235
ret = bch2_trans_run(c,
1236
for_each_btree_key_commit(trans, iter, i,
1237
POS_MIN,
1238
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
1239
k,
1240
NULL, NULL,
1241
BCH_TRANS_COMMIT_no_enospc,
1242
gc_btree_gens_key(trans, &iter, k)));
1243
if (ret)
1244
goto err;
1245
}
1246
1247
struct bch_dev *ca = NULL;
1248
ret = bch2_trans_run(c,
1249
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1250
POS_MIN,
1251
BTREE_ITER_prefetch,
1252
k,
1253
NULL, NULL,
1254
BCH_TRANS_COMMIT_no_enospc, ({
1255
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
1256
if (!ca) {
1257
bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
1258
continue;
1259
}
1260
bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
1261
})));
1262
bch2_dev_put(ca);
1263
1264
if (ret)
1265
goto err;
1266
1267
c->gc_gens_btree = 0;
1268
c->gc_gens_pos = POS_MIN;
1269
1270
c->gc_count++;
1271
1272
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
1273
trace_and_count(c, gc_gens_end, c);
1274
err:
1275
for_each_member_device(c, ca) {
1276
kvfree(ca->oldest_gen);
1277
ca->oldest_gen = NULL;
1278
}
1279
1280
up_read(&c->state_lock);
1281
mutex_unlock(&c->gc_gens_lock);
1282
if (!bch2_err_matches(ret, EROFS))
1283
bch_err_fn(c, ret);
1284
return ret;
1285
}
1286
1287
static void bch2_gc_gens_work(struct work_struct *work)
1288
{
1289
struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
1290
bch2_gc_gens(c);
1291
enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1292
}
1293
1294
void bch2_gc_gens_async(struct bch_fs *c)
1295
{
1296
if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
1297
!queue_work(c->write_ref_wq, &c->gc_gens_work))
1298
enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1299
}
1300
1301
void bch2_fs_btree_gc_init_early(struct bch_fs *c)
1302
{
1303
seqcount_init(&c->gc_pos_lock);
1304
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
1305
1306
init_rwsem(&c->gc_lock);
1307
mutex_init(&c->gc_gens_lock);
1308
}
1309
1310