GitHub Repository: torvalds/linux
Path: blob/master/fs/bcachefs/alloc_background.c
1
// SPDX-License-Identifier: GPL-2.0
2
#include "bcachefs.h"
3
#include "alloc_background.h"
4
#include "alloc_foreground.h"
5
#include "backpointers.h"
6
#include "bkey_buf.h"
7
#include "btree_cache.h"
8
#include "btree_io.h"
9
#include "btree_key_cache.h"
10
#include "btree_update.h"
11
#include "btree_update_interior.h"
12
#include "btree_gc.h"
13
#include "btree_write_buffer.h"
14
#include "buckets.h"
15
#include "buckets_waiting_for_journal.h"
16
#include "clock.h"
17
#include "debug.h"
18
#include "disk_accounting.h"
19
#include "ec.h"
20
#include "enumerated_ref.h"
21
#include "error.h"
22
#include "lru.h"
23
#include "recovery.h"
24
#include "varint.h"
25
26
#include <linux/kthread.h>
27
#include <linux/math64.h>
28
#include <linux/random.h>
29
#include <linux/rculist.h>
30
#include <linux/rcupdate.h>
31
#include <linux/sched/task.h>
32
#include <linux/sort.h>
33
#include <linux/jiffies.h>
34
35
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
36
37
/* Persistent alloc info: */
38
39
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
40
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
41
BCH_ALLOC_FIELDS_V1()
42
#undef x
43
};
44
45
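/*
 * In-memory representation of the older KEY_TYPE_alloc{,_v2,_v3} on-disk
 * formats; the unpack helpers below decode each version into this struct so
 * the rest of the code only has to deal with one layout.
 */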
struct bkey_alloc_unpacked {
46
u64 journal_seq;
47
u8 gen;
48
u8 oldest_gen;
49
u8 data_type;
50
bool need_discard:1;
51
bool need_inc_gen:1;
52
#define x(_name, _bits) u##_bits _name;
53
BCH_ALLOC_FIELDS_V2()
54
#undef x
55
};
56
57
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
58
const void **p, unsigned field)
59
{
60
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
61
u64 v;
62
63
if (!(a->fields & (1 << field)))
64
return 0;
65
66
switch (bytes) {
67
case 1:
68
v = *((const u8 *) *p);
69
break;
70
case 2:
71
v = le16_to_cpup(*p);
72
break;
73
case 4:
74
v = le32_to_cpup(*p);
75
break;
76
case 8:
77
v = le64_to_cpup(*p);
78
break;
79
default:
80
BUG();
81
}
82
83
*p += bytes;
84
return v;
85
}
86
87
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
88
struct bkey_s_c k)
89
{
90
const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
91
const void *d = in->data;
92
unsigned idx = 0;
93
94
out->gen = in->gen;
95
96
#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
97
BCH_ALLOC_FIELDS_V1()
98
#undef x
99
}
100
101
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
102
struct bkey_s_c k)
103
{
104
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
105
const u8 *in = a.v->data;
106
const u8 *end = bkey_val_end(a);
107
unsigned fieldnr = 0;
108
int ret;
109
u64 v;
110
111
out->gen = a.v->gen;
112
out->oldest_gen = a.v->oldest_gen;
113
out->data_type = a.v->data_type;
114
115
#define x(_name, _bits) \
116
if (fieldnr < a.v->nr_fields) { \
117
ret = bch2_varint_decode_fast(in, end, &v); \
118
if (ret < 0) \
119
return ret; \
120
in += ret; \
121
} else { \
122
v = 0; \
123
} \
124
out->_name = v; \
125
if (v != out->_name) \
126
return -1; \
127
fieldnr++;
128
129
BCH_ALLOC_FIELDS_V2()
130
#undef x
131
return 0;
132
}
133
134
static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
135
struct bkey_s_c k)
136
{
137
struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
138
const u8 *in = a.v->data;
139
const u8 *end = bkey_val_end(a);
140
unsigned fieldnr = 0;
141
int ret;
142
u64 v;
143
144
out->gen = a.v->gen;
145
out->oldest_gen = a.v->oldest_gen;
146
out->data_type = a.v->data_type;
147
out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
148
out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
149
out->journal_seq = le64_to_cpu(a.v->journal_seq);
150
151
#define x(_name, _bits) \
152
if (fieldnr < a.v->nr_fields) { \
153
ret = bch2_varint_decode_fast(in, end, &v); \
154
if (ret < 0) \
155
return ret; \
156
in += ret; \
157
} else { \
158
v = 0; \
159
} \
160
out->_name = v; \
161
if (v != out->_name) \
162
return -1; \
163
fieldnr++;
164
165
BCH_ALLOC_FIELDS_V2()
166
#undef x
167
return 0;
168
}
169
170
static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
171
{
172
struct bkey_alloc_unpacked ret = { .gen = 0 };
173
174
switch (k.k->type) {
175
case KEY_TYPE_alloc:
176
bch2_alloc_unpack_v1(&ret, k);
177
break;
178
case KEY_TYPE_alloc_v2:
179
bch2_alloc_unpack_v2(&ret, k);
180
break;
181
case KEY_TYPE_alloc_v3:
182
bch2_alloc_unpack_v3(&ret, k);
183
break;
184
}
185
186
return ret;
187
}
188
189
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
190
{
191
unsigned i, bytes = offsetof(struct bch_alloc, data);
192
193
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
194
if (a->fields & (1 << i))
195
bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
196
197
return DIV_ROUND_UP(bytes, sizeof(u64));
198
}
199
200
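/*
 * Validation of on-disk alloc keys: each format version is checked for a sane
 * value size and internally consistent fields before it is trusted elsewhere.
 */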
int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
201
struct bkey_validate_context from)
202
{
203
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
204
int ret = 0;
205
206
/* allow for unknown fields */
207
bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
208
c, alloc_v1_val_size_bad,
209
"incorrect value size (%zu < %u)",
210
bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
211
fsck_err:
212
return ret;
213
}
214
215
int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
216
struct bkey_validate_context from)
217
{
218
struct bkey_alloc_unpacked u;
219
int ret = 0;
220
221
bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
222
c, alloc_v2_unpack_error,
223
"unpack error");
224
fsck_err:
225
return ret;
226
}
227
228
int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
229
struct bkey_validate_context from)
230
{
231
struct bkey_alloc_unpacked u;
232
int ret = 0;
233
234
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
235
c, alloc_v3_unpack_error,
236
"unpack error");
237
fsck_err:
238
return ret;
239
}
240
241
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
242
struct bkey_validate_context from)
243
{
244
struct bch_alloc_v4 a;
245
int ret = 0;
246
247
bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
248
249
bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
250
c, alloc_v4_val_size_bad,
251
"bad val size (%u > %zu)",
252
alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
253
254
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
255
BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
256
c, alloc_v4_backpointers_start_bad,
257
"invalid backpointers_start");
258
259
bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
260
c, alloc_key_data_type_bad,
261
"invalid data type (got %u should be %u)",
262
a.data_type, alloc_data_type(a, a.data_type));
263
264
for (unsigned i = 0; i < 2; i++)
265
bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
266
c, alloc_key_io_time_bad,
267
"invalid io_time[%s]: %llu, max %llu",
268
i == READ ? "read" : "write",
269
a.io_time[i], LRU_TIME_MAX);
270
271
unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
272
offsetof(struct bch_alloc_v4, stripe_sectors)
273
? a.stripe_sectors
274
: 0;
275
276
switch (a.data_type) {
277
case BCH_DATA_free:
278
case BCH_DATA_need_gc_gens:
279
case BCH_DATA_need_discard:
280
bkey_fsck_err_on(stripe_sectors ||
281
a.dirty_sectors ||
282
a.cached_sectors ||
283
a.stripe,
284
c, alloc_key_empty_but_have_data,
285
"empty data type free but have data %u.%u.%u %u",
286
stripe_sectors,
287
a.dirty_sectors,
288
a.cached_sectors,
289
a.stripe);
290
break;
291
case BCH_DATA_sb:
292
case BCH_DATA_journal:
293
case BCH_DATA_btree:
294
case BCH_DATA_user:
295
case BCH_DATA_parity:
296
bkey_fsck_err_on(!a.dirty_sectors &&
297
!stripe_sectors,
298
c, alloc_key_dirty_sectors_0,
299
"data_type %s but dirty_sectors==0",
300
bch2_data_type_str(a.data_type));
301
break;
302
case BCH_DATA_cached:
303
bkey_fsck_err_on(!a.cached_sectors ||
304
a.dirty_sectors ||
305
stripe_sectors ||
306
a.stripe,
307
c, alloc_key_cached_inconsistency,
308
"data type inconsistency");
309
310
bkey_fsck_err_on(!a.io_time[READ] &&
311
!(c->recovery.passes_to_run &
312
BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
313
c, alloc_key_cached_but_read_time_zero,
314
"cached bucket with read_time == 0");
315
break;
316
case BCH_DATA_stripe:
317
break;
318
}
319
fsck_err:
320
return ret;
321
}
322
323
void bch2_alloc_v4_swab(struct bkey_s k)
324
{
325
struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
326
327
a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
328
a->journal_seq_empty = swab64(a->journal_seq_empty);
329
a->flags = swab32(a->flags);
330
a->dirty_sectors = swab32(a->dirty_sectors);
331
a->cached_sectors = swab32(a->cached_sectors);
332
a->io_time[0] = swab64(a->io_time[0]);
333
a->io_time[1] = swab64(a->io_time[1]);
334
a->stripe = swab32(a->stripe);
335
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
336
a->stripe_sectors = swab32(a->stripe_sectors);
337
}
338
339
static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
340
unsigned dev, const struct bch_alloc_v4 *a)
341
{
342
struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
343
344
prt_newline(out);
345
printbuf_indent_add(out, 2);
346
347
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
348
bch2_prt_data_type(out, a->data_type);
349
prt_newline(out);
350
prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
351
prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
352
prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
353
prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
354
prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
355
prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
356
prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
357
prt_printf(out, "stripe %u\n", a->stripe);
358
prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
359
prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
360
prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
361
362
if (ca)
363
prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
364
prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
365
printbuf_indent_sub(out, 2);
366
367
bch2_dev_put(ca);
368
}
369
370
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
371
{
372
struct bch_alloc_v4 _a;
373
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
374
375
__bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
376
}
377
378
void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
379
{
380
__bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
381
}
382
383
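/*
 * Convert any alloc key version to the current bch_alloc_v4 representation;
 * for keys that are already v4 this also zeroes the gap before the
 * backpointers area so the result is in canonical form.
 */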
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
384
{
385
if (k.k->type == KEY_TYPE_alloc_v4) {
386
void *src, *dst;
387
388
*out = *bkey_s_c_to_alloc_v4(k).v;
389
390
src = alloc_v4_backpointers(out);
391
SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
392
dst = alloc_v4_backpointers(out);
393
394
if (src < dst)
395
memset(src, 0, dst - src);
396
397
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
398
} else {
399
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
400
401
*out = (struct bch_alloc_v4) {
402
.journal_seq_nonempty = u.journal_seq,
403
.flags = u.need_discard,
404
.gen = u.gen,
405
.oldest_gen = u.oldest_gen,
406
.data_type = u.data_type,
407
.stripe_redundancy = u.stripe_redundancy,
408
.dirty_sectors = u.dirty_sectors,
409
.cached_sectors = u.cached_sectors,
410
.io_time[READ] = u.read_time,
411
.io_time[WRITE] = u.write_time,
412
.stripe = u.stripe,
413
};
414
415
SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
416
}
417
}
418
419
static noinline struct bkey_i_alloc_v4 *
420
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
421
{
422
struct bkey_i_alloc_v4 *ret;
423
424
ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
425
if (IS_ERR(ret))
426
return ret;
427
428
if (k.k->type == KEY_TYPE_alloc_v4) {
429
void *src, *dst;
430
431
bkey_reassemble(&ret->k_i, k);
432
433
src = alloc_v4_backpointers(&ret->v);
434
SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
435
dst = alloc_v4_backpointers(&ret->v);
436
437
if (src < dst)
438
memset(src, 0, dst - src);
439
440
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
441
set_alloc_v4_u64s(ret);
442
} else {
443
bkey_alloc_v4_init(&ret->k_i);
444
ret->k.p = k.k->p;
445
bch2_alloc_to_v4(k, &ret->v);
446
}
447
return ret;
448
}
449
450
static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
451
{
452
struct bkey_s_c_alloc_v4 a;
453
454
if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
455
((a = bkey_s_c_to_alloc_v4(k), true) &&
456
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
457
return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
458
459
return __bch2_alloc_to_v4_mut(trans, k);
460
}
461
462
struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
463
{
464
return bch2_alloc_to_v4_mut_inlined(trans, k);
465
}
466
467
struct bkey_i_alloc_v4 *
468
bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
469
struct bpos pos)
470
{
471
struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
472
BTREE_ITER_with_updates|
473
BTREE_ITER_cached|
474
BTREE_ITER_intent);
475
int ret = bkey_err(k);
476
if (unlikely(ret))
477
return ERR_PTR(ret);
478
479
struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
480
ret = PTR_ERR_OR_ZERO(a);
481
if (unlikely(ret))
482
goto err;
483
return a;
484
err:
485
bch2_trans_iter_exit(trans, iter);
486
return ERR_PTR(ret);
487
}
488
489
__flatten
490
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
491
enum btree_iter_update_trigger_flags flags)
492
{
493
struct btree_iter iter;
494
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
495
BTREE_ITER_with_updates|
496
BTREE_ITER_cached|
497
BTREE_ITER_intent);
498
int ret = bkey_err(k);
499
if (unlikely(ret))
500
return ERR_PTR(ret);
501
502
if ((void *) k.v >= trans->mem &&
503
(void *) k.v < trans->mem + trans->mem_top) {
504
bch2_trans_iter_exit(trans, &iter);
505
return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
506
}
507
508
struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
509
if (IS_ERR(a)) {
510
bch2_trans_iter_exit(trans, &iter);
511
return a;
512
}
513
514
ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
515
bch2_trans_iter_exit(trans, &iter);
516
return unlikely(ret) ? ERR_PTR(ret) : a;
517
}
518
519
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
520
{
521
*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
522
523
pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
524
return pos;
525
}
526
527
static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
528
{
529
pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
530
pos.offset += offset;
531
return pos;
532
}
533
534
static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
535
{
536
return k.k->type == KEY_TYPE_bucket_gens
537
? bkey_s_c_to_bucket_gens(k).v->gens[offset]
538
: 0;
539
}
540
541
int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
542
struct bkey_validate_context from)
543
{
544
int ret = 0;
545
546
bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
547
c, bucket_gens_val_size_bad,
548
"bad val size (%zu != %zu)",
549
bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
550
fsck_err:
551
return ret;
552
}
553
554
void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
555
{
556
struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
557
unsigned i;
558
559
for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
560
if (i)
561
prt_char(out, ' ');
562
prt_printf(out, "%u", g.v->gens[i]);
563
}
564
}
565
566
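/*
 * Build the bucket_gens btree from the alloc btree: each bucket_gens key packs
 * the generation numbers of many consecutive buckets, giving a much more
 * compact structure to read back at mount time.
 */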
int bch2_bucket_gens_init(struct bch_fs *c)
567
{
568
struct btree_trans *trans = bch2_trans_get(c);
569
struct bkey_i_bucket_gens g;
570
bool have_bucket_gens_key = false;
571
int ret;
572
573
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
574
BTREE_ITER_prefetch, k, ({
575
/*
576
* Not a fsck error because this is checked/repaired by
577
* bch2_check_alloc_key() which runs later:
578
*/
579
if (!bch2_dev_bucket_exists(c, k.k->p))
580
continue;
581
582
struct bch_alloc_v4 a;
583
u8 gen = bch2_alloc_to_v4(k, &a)->gen;
584
unsigned offset;
585
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
586
int ret2 = 0;
587
588
if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
589
ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
590
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
591
if (ret2)
592
goto iter_err;
593
have_bucket_gens_key = false;
594
}
595
596
if (!have_bucket_gens_key) {
597
bkey_bucket_gens_init(&g.k_i);
598
g.k.p = pos;
599
have_bucket_gens_key = true;
600
}
601
602
g.v.gens[offset] = gen;
603
iter_err:
604
ret2;
605
}));
606
607
if (have_bucket_gens_key && !ret)
608
ret = commit_do(trans, NULL, NULL,
609
BCH_TRANS_COMMIT_no_enospc,
610
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
611
612
bch2_trans_put(trans);
613
614
bch_err_fn(c, ret);
615
return ret;
616
}
617
618
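/*
 * Populate the in-memory bucket generation numbers at startup: from the
 * compact bucket_gens btree when the superblock version indicates it exists,
 * otherwise by walking the full alloc btree.
 */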
int bch2_alloc_read(struct bch_fs *c)
619
{
620
down_read(&c->state_lock);
621
622
struct btree_trans *trans = bch2_trans_get(c);
623
struct bch_dev *ca = NULL;
624
int ret;
625
626
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
627
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
628
BTREE_ITER_prefetch, k, ({
629
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
630
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
631
632
if (k.k->type != KEY_TYPE_bucket_gens)
633
continue;
634
635
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
636
/*
637
* Not a fsck error because this is checked/repaired by
638
* bch2_check_alloc_key() which runs later:
639
*/
640
if (!ca) {
641
bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
642
continue;
643
}
644
645
const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
646
647
for (u64 b = max_t(u64, ca->mi.first_bucket, start);
648
b < min_t(u64, ca->mi.nbuckets, end);
649
b++)
650
*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
651
0;
652
}));
653
} else {
654
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
655
BTREE_ITER_prefetch, k, ({
656
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
657
/*
658
* Not a fsck error because this is checked/repaired by
659
* bch2_check_alloc_key() which runs later:
660
*/
661
if (!ca) {
662
bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
663
continue;
664
}
665
666
if (k.k->p.offset < ca->mi.first_bucket) {
667
bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
668
continue;
669
}
670
671
if (k.k->p.offset >= ca->mi.nbuckets) {
672
bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
673
continue;
674
}
675
676
struct bch_alloc_v4 a;
677
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
678
0;
679
}));
680
}
681
682
bch2_dev_put(ca);
683
bch2_trans_put(trans);
684
685
up_read(&c->state_lock);
686
bch_err_fn(c, ret);
687
return ret;
688
}
689
690
/* Free space/discard btree: */
691
692
static int __need_discard_or_freespace_err(struct btree_trans *trans,
693
struct bkey_s_c alloc_k,
694
bool set, bool discard, bool repair)
695
{
696
struct bch_fs *c = trans->c;
697
enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
698
enum bch_sb_error_id err_id = discard
699
? BCH_FSCK_ERR_need_discard_key_wrong
700
: BCH_FSCK_ERR_freespace_key_wrong;
701
enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
702
struct printbuf buf = PRINTBUF;
703
704
bch2_bkey_val_to_text(&buf, c, alloc_k);
705
706
int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
707
"bucket incorrectly %sset in %s btree\n%s",
708
set ? "" : "un",
709
bch2_btree_id_str(btree),
710
buf.buf);
711
if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
712
bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
713
ret = 0;
714
715
printbuf_exit(&buf);
716
return ret;
717
}
718
719
#define need_discard_or_freespace_err(...) \
720
fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
721
722
#define need_discard_or_freespace_err_on(cond, ...) \
723
(unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
724
725
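/*
 * Keep the need_discard and freespace btrees in sync with a bucket's
 * allocation state: when an alloc key changes, the corresponding entry in
 * those btrees is set or cleared here.
 */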
static int bch2_bucket_do_index(struct btree_trans *trans,
726
struct bch_dev *ca,
727
struct bkey_s_c alloc_k,
728
const struct bch_alloc_v4 *a,
729
bool set)
730
{
731
enum btree_id btree;
732
struct bpos pos;
733
734
if (a->data_type != BCH_DATA_free &&
735
a->data_type != BCH_DATA_need_discard)
736
return 0;
737
738
switch (a->data_type) {
739
case BCH_DATA_free:
740
btree = BTREE_ID_freespace;
741
pos = alloc_freespace_pos(alloc_k.k->p, *a);
742
break;
743
case BCH_DATA_need_discard:
744
btree = BTREE_ID_need_discard;
745
pos = alloc_k.k->p;
746
break;
747
default:
748
return 0;
749
}
750
751
struct btree_iter iter;
752
struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
753
int ret = bkey_err(old);
754
if (ret)
755
return ret;
756
757
need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
758
!old.k->type != set,
759
trans, alloc_k, set,
760
btree == BTREE_ID_need_discard, false);
761
762
ret = bch2_btree_bit_mod_iter(trans, &iter, set);
763
fsck_err:
764
bch2_trans_iter_exit(trans, &iter);
765
return ret;
766
}
767
768
static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
769
struct bpos bucket, u8 gen)
770
{
771
struct btree_iter iter;
772
unsigned offset;
773
struct bpos pos = alloc_gens_pos(bucket, &offset);
774
struct bkey_i_bucket_gens *g;
775
struct bkey_s_c k;
776
int ret;
777
778
g = bch2_trans_kmalloc(trans, sizeof(*g));
779
ret = PTR_ERR_OR_ZERO(g);
780
if (ret)
781
return ret;
782
783
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
784
BTREE_ITER_intent|
785
BTREE_ITER_with_updates);
786
ret = bkey_err(k);
787
if (ret)
788
return ret;
789
790
if (k.k->type != KEY_TYPE_bucket_gens) {
791
bkey_bucket_gens_init(&g->k_i);
792
g->k.p = iter.pos;
793
} else {
794
bkey_reassemble(&g->k_i, k);
795
}
796
797
g->v.gens[offset] = gen;
798
799
ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
800
bch2_trans_iter_exit(trans, &iter);
801
return ret;
802
}
803
804
static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
805
enum bch_data_type data_type,
806
s64 delta_buckets,
807
s64 delta_sectors,
808
s64 delta_fragmented, unsigned flags)
809
{
810
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
811
812
return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
813
d, dev_data_type,
814
.dev = ca->dev_idx,
815
.data_type = data_type);
816
}
817
818
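/*
 * Update per-device accounting (bucket counts, sector counts, fragmentation,
 * unstriped sectors) to reflect an alloc key changing from @old to @new.
 */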
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
819
const struct bch_alloc_v4 *old,
820
const struct bch_alloc_v4 *new,
821
unsigned flags)
822
{
823
s64 old_sectors = bch2_bucket_sectors(*old);
824
s64 new_sectors = bch2_bucket_sectors(*new);
825
if (old->data_type != new->data_type) {
826
int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
827
1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
828
bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
829
-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
830
if (ret)
831
return ret;
832
} else if (old_sectors != new_sectors) {
833
int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
834
0,
835
new_sectors - old_sectors,
836
bch2_bucket_sectors_fragmented(ca, *new) -
837
bch2_bucket_sectors_fragmented(ca, *old), flags);
838
if (ret)
839
return ret;
840
}
841
842
s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
843
s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
844
if (old_unstriped != new_unstriped) {
845
int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
846
!!new_unstriped - !!old_unstriped,
847
new_unstriped - old_unstriped,
848
0,
849
flags);
850
if (ret)
851
return ret;
852
}
853
854
return 0;
855
}
856
857
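/*
 * Trigger run for every alloc key update. In transactional mode it maintains
 * the derived indexes (need_discard/freespace btrees, LRUs, bucket_gens,
 * device accounting); in atomic mode it tracks journal sequence numbers for
 * empty <-> nonempty transitions and kicks off discard/invalidate/gc_gens
 * work as buckets change state.
 */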
int bch2_trigger_alloc(struct btree_trans *trans,
858
enum btree_id btree, unsigned level,
859
struct bkey_s_c old, struct bkey_s new,
860
enum btree_iter_update_trigger_flags flags)
861
{
862
struct bch_fs *c = trans->c;
863
struct printbuf buf = PRINTBUF;
864
int ret = 0;
865
866
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
867
if (!ca)
868
return bch_err_throw(c, trigger_alloc);
869
870
struct bch_alloc_v4 old_a_convert;
871
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
872
873
struct bch_alloc_v4 *new_a;
874
if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
875
new_a = bkey_s_to_alloc_v4(new).v;
876
} else {
877
BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
878
879
struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
880
ret = PTR_ERR_OR_ZERO(new_ka);
881
if (unlikely(ret))
882
goto err;
883
new_a = &new_ka->v;
884
}
885
886
if (flags & BTREE_TRIGGER_transactional) {
887
alloc_data_type_set(new_a, new_a->data_type);
888
889
int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
890
(int) data_type_is_empty(old_a->data_type);
891
892
if (is_empty_delta < 0) {
893
new_a->io_time[READ] = bch2_current_io_time(c, READ);
894
new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
895
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
896
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
897
}
898
899
if (data_type_is_empty(new_a->data_type) &&
900
BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
901
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
902
if (new_a->oldest_gen == new_a->gen &&
903
!bch2_bucket_sectors_total(*new_a))
904
new_a->oldest_gen++;
905
new_a->gen++;
906
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
907
alloc_data_type_set(new_a, new_a->data_type);
908
}
909
910
if (old_a->data_type != new_a->data_type ||
911
(new_a->data_type == BCH_DATA_free &&
912
alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
913
ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
914
bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
915
if (ret)
916
goto err;
917
}
918
919
if (new_a->data_type == BCH_DATA_cached &&
920
!new_a->io_time[READ])
921
new_a->io_time[READ] = bch2_current_io_time(c, READ);
922
923
ret = bch2_lru_change(trans, new.k->p.inode,
924
bucket_to_u64(new.k->p),
925
alloc_lru_idx_read(*old_a),
926
alloc_lru_idx_read(*new_a));
927
if (ret)
928
goto err;
929
930
ret = bch2_lru_change(trans,
931
BCH_LRU_BUCKET_FRAGMENTATION,
932
bucket_to_u64(new.k->p),
933
alloc_lru_idx_fragmentation(*old_a, ca),
934
alloc_lru_idx_fragmentation(*new_a, ca));
935
if (ret)
936
goto err;
937
938
if (old_a->gen != new_a->gen) {
939
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
940
if (ret)
941
goto err;
942
}
943
944
ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
945
if (ret)
946
goto err;
947
}
948
949
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
950
u64 transaction_seq = trans->journal_res.seq;
951
BUG_ON(!transaction_seq);
952
953
if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
954
trans, alloc_key_journal_seq_in_future,
955
"bucket journal seq in future (currently at %llu)\n%s",
956
journal_cur_seq(&c->journal),
957
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
958
new_a->journal_seq_nonempty = transaction_seq;
959
960
int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
961
(int) data_type_is_empty(old_a->data_type);
962
963
/*
964
* Record journal sequence number of empty -> nonempty transition:
965
* Note that there may be multiple empty -> nonempty
966
* transitions, data in a bucket may be overwritten while we're
967
* still writing to it - so be careful to only record the first:
968
*/
969
if (is_empty_delta < 0 &&
970
new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
971
new_a->journal_seq_nonempty = transaction_seq;
972
new_a->journal_seq_empty = 0;
973
}
974
975
/*
976
* Bucket becomes empty: mark it as waiting for a journal flush,
977
* unless updates since empty -> nonempty transition were never
978
* flushed - we may need to ask the journal not to flush
979
* intermediate sequence numbers:
980
*/
981
if (is_empty_delta > 0) {
982
if (new_a->journal_seq_nonempty == transaction_seq ||
983
bch2_journal_noflush_seq(&c->journal,
984
new_a->journal_seq_nonempty,
985
transaction_seq)) {
986
new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
987
} else {
988
new_a->journal_seq_empty = transaction_seq;
989
990
ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
991
c->journal.flushed_seq_ondisk,
992
new.k->p.inode, new.k->p.offset,
993
transaction_seq);
994
if (bch2_fs_fatal_err_on(ret, c,
995
"setting bucket_needs_journal_commit: %s",
996
bch2_err_str(ret)))
997
goto err;
998
}
999
}
1000
1001
if (new_a->gen != old_a->gen) {
1002
guard(rcu)();
1003
u8 *gen = bucket_gen(ca, new.k->p.offset);
1004
if (unlikely(!gen))
1005
goto invalid_bucket;
1006
*gen = new_a->gen;
1007
}
1008
1009
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
1010
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
1011
#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
1012
1013
if (statechange(a->data_type == BCH_DATA_free) &&
1014
bucket_flushed(new_a))
1015
closure_wake_up(&c->freelist_wait);
1016
1017
if (statechange(a->data_type == BCH_DATA_need_discard) &&
1018
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
1019
bucket_flushed(new_a))
1020
bch2_discard_one_bucket_fast(ca, new.k->p.offset);
1021
1022
if (statechange(a->data_type == BCH_DATA_cached) &&
1023
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
1024
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
1025
bch2_dev_do_invalidates(ca);
1026
1027
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
1028
bch2_gc_gens_async(c);
1029
}
1030
1031
if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
1032
guard(rcu)();
1033
struct bucket *g = gc_bucket(ca, new.k->p.offset);
1034
if (unlikely(!g))
1035
goto invalid_bucket;
1036
g->gen_valid = 1;
1037
g->gen = new_a->gen;
1038
}
1039
err:
1040
fsck_err:
1041
printbuf_exit(&buf);
1042
bch2_dev_put(ca);
1043
return ret;
1044
invalid_bucket:
1045
bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
1046
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
1047
ret = bch_err_throw(c, trigger_alloc);
1048
goto err;
1049
}
1050
1051
/*
1052
* This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
1053
* extents style btrees, but works on non-extents btrees:
1054
*/
1055
static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
1056
struct bpos end, struct bkey *hole)
1057
{
1058
struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
1059
1060
if (bkey_err(k))
1061
return k;
1062
1063
if (k.k->type) {
1064
return k;
1065
} else {
1066
struct btree_iter iter2;
1067
struct bpos next;
1068
1069
bch2_trans_copy_iter(trans, &iter2, iter);
1070
1071
struct btree_path *path = btree_iter_path(trans, iter);
1072
if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
1073
end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
1074
1075
end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
1076
1077
/*
1078
* btree node min/max is a closed interval, upto takes a half
1079
* open interval:
1080
*/
1081
k = bch2_btree_iter_peek_max(trans, &iter2, end);
1082
next = iter2.pos;
1083
bch2_trans_iter_exit(trans, &iter2);
1084
1085
BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
1086
1087
if (bkey_err(k))
1088
return k;
1089
1090
bkey_init(hole);
1091
hole->p = iter->pos;
1092
1093
bch2_key_resize(hole, next.offset - iter->pos.offset);
1094
return (struct bkey_s_c) { hole, NULL };
1095
}
1096
}
1097
1098
static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
1099
{
1100
if (*ca) {
1101
if (bucket->offset < (*ca)->mi.first_bucket)
1102
bucket->offset = (*ca)->mi.first_bucket;
1103
1104
if (bucket->offset < (*ca)->mi.nbuckets)
1105
return true;
1106
1107
bch2_dev_put(*ca);
1108
*ca = NULL;
1109
bucket->inode++;
1110
bucket->offset = 0;
1111
}
1112
1113
guard(rcu)();
1114
*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
1115
if (*ca) {
1116
*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
1117
bch2_dev_get(*ca);
1118
}
1119
1120
return *ca != NULL;
1121
}
1122
1123
static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
1124
struct btree_iter *iter,
1125
struct bch_dev **ca, struct bkey *hole)
1126
{
1127
struct bch_fs *c = trans->c;
1128
struct bkey_s_c k;
1129
again:
1130
k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
1131
if (bkey_err(k))
1132
return k;
1133
1134
*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
1135
1136
if (!k.k->type) {
1137
struct bpos hole_start = bkey_start_pos(k.k);
1138
1139
if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
1140
if (!next_bucket(c, ca, &hole_start))
1141
return bkey_s_c_null;
1142
1143
bch2_btree_iter_set_pos(trans, iter, hole_start);
1144
goto again;
1145
}
1146
1147
if (k.k->p.offset > (*ca)->mi.nbuckets)
1148
bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
1149
}
1150
1151
return k;
1152
}
1153
1154
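/*
 * fsck: cross-check a single alloc key against the need_discard, freespace
 * and bucket_gens btrees, repairing whichever of them disagrees.
 */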
static noinline_for_stack
1155
int bch2_check_alloc_key(struct btree_trans *trans,
1156
struct bkey_s_c alloc_k,
1157
struct btree_iter *alloc_iter,
1158
struct btree_iter *discard_iter,
1159
struct btree_iter *freespace_iter,
1160
struct btree_iter *bucket_gens_iter)
1161
{
1162
struct bch_fs *c = trans->c;
1163
struct bch_alloc_v4 a_convert;
1164
const struct bch_alloc_v4 *a;
1165
unsigned gens_offset;
1166
struct bkey_s_c k;
1167
struct printbuf buf = PRINTBUF;
1168
int ret = 0;
1169
1170
struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
1171
if (fsck_err_on(!ca,
1172
trans, alloc_key_to_missing_dev_bucket,
1173
"alloc key for invalid device:bucket %llu:%llu",
1174
alloc_k.k->p.inode, alloc_k.k->p.offset))
1175
ret = bch2_btree_delete_at(trans, alloc_iter, 0);
1176
if (!ca)
1177
return ret;
1178
1179
if (!ca->mi.freespace_initialized)
1180
goto out;
1181
1182
a = bch2_alloc_to_v4(alloc_k, &a_convert);
1183
1184
bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
1185
k = bch2_btree_iter_peek_slot(trans, discard_iter);
1186
ret = bkey_err(k);
1187
if (ret)
1188
goto err;
1189
1190
bool is_discarded = a->data_type == BCH_DATA_need_discard;
1191
if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
1192
trans, alloc_k, !is_discarded, true, true)) {
1193
ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
1194
if (ret)
1195
goto err;
1196
}
1197
1198
bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
1199
k = bch2_btree_iter_peek_slot(trans, freespace_iter);
1200
ret = bkey_err(k);
1201
if (ret)
1202
goto err;
1203
1204
bool is_free = a->data_type == BCH_DATA_free;
1205
if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
1206
trans, alloc_k, !is_free, false, true)) {
1207
ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
1208
if (ret)
1209
goto err;
1210
}
1211
1212
bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
1213
k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
1214
ret = bkey_err(k);
1215
if (ret)
1216
goto err;
1217
1218
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
1219
trans, bucket_gens_key_wrong,
1220
"incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
1221
alloc_gen(k, gens_offset), a->gen,
1222
(printbuf_reset(&buf),
1223
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1224
struct bkey_i_bucket_gens *g =
1225
bch2_trans_kmalloc(trans, sizeof(*g));
1226
1227
ret = PTR_ERR_OR_ZERO(g);
1228
if (ret)
1229
goto err;
1230
1231
if (k.k->type == KEY_TYPE_bucket_gens) {
1232
bkey_reassemble(&g->k_i, k);
1233
} else {
1234
bkey_bucket_gens_init(&g->k_i);
1235
g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
1236
}
1237
1238
g->v.gens[gens_offset] = a->gen;
1239
1240
ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
1241
if (ret)
1242
goto err;
1243
}
1244
out:
1245
err:
1246
fsck_err:
1247
bch2_dev_put(ca);
1248
printbuf_exit(&buf);
1249
return ret;
1250
}
1251
1252
static noinline_for_stack
1253
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
1254
struct bch_dev *ca,
1255
struct bpos start,
1256
struct bpos *end,
1257
struct btree_iter *freespace_iter)
1258
{
1259
struct bkey_s_c k;
1260
struct printbuf buf = PRINTBUF;
1261
int ret;
1262
1263
if (!ca->mi.freespace_initialized)
1264
return 0;
1265
1266
bch2_btree_iter_set_pos(trans, freespace_iter, start);
1267
1268
k = bch2_btree_iter_peek_slot(trans, freespace_iter);
1269
ret = bkey_err(k);
1270
if (ret)
1271
goto err;
1272
1273
*end = bkey_min(k.k->p, *end);
1274
1275
if (fsck_err_on(k.k->type != KEY_TYPE_set,
1276
trans, freespace_hole_missing,
1277
"hole in alloc btree missing in freespace btree\n"
1278
"device %llu buckets %llu-%llu",
1279
freespace_iter->pos.inode,
1280
freespace_iter->pos.offset,
1281
end->offset)) {
1282
struct bkey_i *update =
1283
bch2_trans_kmalloc(trans, sizeof(*update));
1284
1285
ret = PTR_ERR_OR_ZERO(update);
1286
if (ret)
1287
goto err;
1288
1289
bkey_init(&update->k);
1290
update->k.type = KEY_TYPE_set;
1291
update->k.p = freespace_iter->pos;
1292
bch2_key_resize(&update->k,
1293
min_t(u64, U32_MAX, end->offset -
1294
freespace_iter->pos.offset));
1295
1296
ret = bch2_trans_update(trans, freespace_iter, update, 0);
1297
if (ret)
1298
goto err;
1299
}
1300
err:
1301
fsck_err:
1302
printbuf_exit(&buf);
1303
return ret;
1304
}
1305
1306
static noinline_for_stack
1307
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1308
struct bpos start,
1309
struct bpos *end,
1310
struct btree_iter *bucket_gens_iter)
1311
{
1312
struct bkey_s_c k;
1313
struct printbuf buf = PRINTBUF;
1314
unsigned i, gens_offset, gens_end_offset;
1315
int ret;
1316
1317
bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1318
1319
k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
1320
ret = bkey_err(k);
1321
if (ret)
1322
goto err;
1323
1324
if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1325
alloc_gens_pos(*end, &gens_end_offset)))
1326
gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1327
1328
if (k.k->type == KEY_TYPE_bucket_gens) {
1329
struct bkey_i_bucket_gens g;
1330
bool need_update = false;
1331
1332
bkey_reassemble(&g.k_i, k);
1333
1334
for (i = gens_offset; i < gens_end_offset; i++) {
1335
if (fsck_err_on(g.v.gens[i], trans,
1336
bucket_gens_hole_wrong,
1337
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1338
bucket_gens_pos_to_alloc(k.k->p, i).inode,
1339
bucket_gens_pos_to_alloc(k.k->p, i).offset,
1340
g.v.gens[i])) {
1341
g.v.gens[i] = 0;
1342
need_update = true;
1343
}
1344
}
1345
1346
if (need_update) {
1347
struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1348
1349
ret = PTR_ERR_OR_ZERO(u);
1350
if (ret)
1351
goto err;
1352
1353
memcpy(u, &g, sizeof(g));
1354
1355
ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1356
if (ret)
1357
goto err;
1358
}
1359
}
1360
1361
*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1362
err:
1363
fsck_err:
1364
printbuf_exit(&buf);
1365
return ret;
1366
}
1367
1368
struct check_discard_freespace_key_async {
1369
struct work_struct work;
1370
struct bch_fs *c;
1371
struct bbpos pos;
1372
};
1373
1374
static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
1375
{
1376
struct btree_iter iter;
1377
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
1378
int ret = bkey_err(k);
1379
if (ret)
1380
return ret;
1381
1382
u8 gen;
1383
ret = k.k->type != KEY_TYPE_set
1384
? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
1385
: 0;
1386
bch2_trans_iter_exit(trans, &iter);
1387
return ret;
1388
}
1389
1390
static void check_discard_freespace_key_work(struct work_struct *work)
1391
{
1392
struct check_discard_freespace_key_async *w =
1393
container_of(work, struct check_discard_freespace_key_async, work);
1394
1395
bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
1396
enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
1397
kfree(w);
1398
}
1399
1400
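/*
 * Check that an entry in the need_discard or freespace btree matches the
 * corresponding alloc key. During fsck, bad entries are deleted directly;
 * when called from the allocator the repair is punted to a worker (committing
 * here would recurse back into the allocator) and the bucket is skipped.
 */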
int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
1401
bool async_repair)
1402
{
1403
struct bch_fs *c = trans->c;
1404
enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1405
? BCH_DATA_need_discard
1406
: BCH_DATA_free;
1407
struct printbuf buf = PRINTBUF;
1408
1409
unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
1410
FSCK_CAN_FIX|FSCK_CAN_IGNORE;
1411
1412
struct bpos bucket = iter->pos;
1413
bucket.offset &= ~(~0ULL << 56);
1414
u64 genbits = iter->pos.offset & (~0ULL << 56);
1415
1416
struct btree_iter alloc_iter;
1417
struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
1418
BTREE_ID_alloc, bucket,
1419
async_repair ? BTREE_ITER_cached : 0);
1420
int ret = bkey_err(alloc_k);
1421
if (ret)
1422
return ret;
1423
1424
if (!bch2_dev_bucket_exists(c, bucket)) {
1425
if (__fsck_err(trans, fsck_flags,
1426
need_discard_freespace_key_to_invalid_dev_bucket,
1427
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
1428
bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
1429
goto delete;
1430
ret = 1;
1431
goto out;
1432
}
1433
1434
struct bch_alloc_v4 a_convert;
1435
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
1436
1437
if (a->data_type != state ||
1438
(state == BCH_DATA_free &&
1439
genbits != alloc_freespace_genbits(*a))) {
1440
if (__fsck_err(trans, fsck_flags,
1441
need_discard_freespace_key_bad,
1442
"%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1443
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1444
bch2_btree_id_str(iter->btree_id),
1445
iter->pos.inode,
1446
iter->pos.offset,
1447
a->data_type == state,
1448
genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1449
goto delete;
1450
ret = 1;
1451
goto out;
1452
}
1453
1454
*gen = a->gen;
1455
out:
1456
fsck_err:
1457
bch2_set_btree_iter_dontneed(trans, &alloc_iter);
1458
bch2_trans_iter_exit(trans, &alloc_iter);
1459
printbuf_exit(&buf);
1460
return ret;
1461
delete:
1462
if (!async_repair) {
1463
ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
1464
bch2_trans_commit(trans, NULL, NULL,
1465
BCH_TRANS_COMMIT_no_enospc) ?:
1466
bch_err_throw(c, transaction_restart_commit);
1467
goto out;
1468
} else {
1469
/*
1470
* We can't repair here when called from the allocator path: the
1471
* commit will recurse back into the allocator
1472
*/
1473
struct check_discard_freespace_key_async *w =
1474
kzalloc(sizeof(*w), GFP_KERNEL);
1475
if (!w)
1476
goto out;
1477
1478
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
1479
kfree(w);
1480
goto out;
1481
}
1482
1483
INIT_WORK(&w->work, check_discard_freespace_key_work);
1484
w->c = c;
1485
w->pos = BBPOS(iter->btree_id, iter->pos);
1486
queue_work(c->write_ref_wq, &w->work);
1487
1488
ret = 1; /* don't allocate from this bucket */
1489
goto out;
1490
}
1491
}
1492
1493
static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
1494
{
1495
u8 gen;
1496
int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
1497
return ret < 0 ? ret : 0;
1498
}
1499
1500
/*
1501
* We've already checked that generation numbers in the bucket_gens btree are
1502
* valid for buckets that exist; this just checks for keys for nonexistent
1503
* buckets.
1504
*/
1505
static noinline_for_stack
1506
int bch2_check_bucket_gens_key(struct btree_trans *trans,
1507
struct btree_iter *iter,
1508
struct bkey_s_c k)
1509
{
1510
struct bch_fs *c = trans->c;
1511
struct bkey_i_bucket_gens g;
1512
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1513
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1514
u64 b;
1515
bool need_update = false;
1516
struct printbuf buf = PRINTBUF;
1517
int ret = 0;
1518
1519
BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1520
bkey_reassemble(&g.k_i, k);
1521
1522
struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
1523
if (!ca) {
1524
if (fsck_err(trans, bucket_gens_to_invalid_dev,
1525
"bucket_gens key for invalid device:\n%s",
1526
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1527
ret = bch2_btree_delete_at(trans, iter, 0);
1528
goto out;
1529
}
1530
1531
if (fsck_err_on(end <= ca->mi.first_bucket ||
1532
start >= ca->mi.nbuckets,
1533
trans, bucket_gens_to_invalid_buckets,
1534
"bucket_gens key for invalid buckets:\n%s",
1535
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1536
ret = bch2_btree_delete_at(trans, iter, 0);
1537
goto out;
1538
}
1539
1540
for (b = start; b < ca->mi.first_bucket; b++)
1541
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1542
trans, bucket_gens_nonzero_for_invalid_buckets,
1543
"bucket_gens key has nonzero gen for invalid bucket")) {
1544
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1545
need_update = true;
1546
}
1547
1548
for (b = ca->mi.nbuckets; b < end; b++)
1549
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1550
trans, bucket_gens_nonzero_for_invalid_buckets,
1551
"bucket_gens key has nonzero gen for invalid bucket")) {
1552
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1553
need_update = true;
1554
}
1555
1556
if (need_update) {
1557
struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1558
1559
ret = PTR_ERR_OR_ZERO(u);
1560
if (ret)
1561
goto out;
1562
1563
memcpy(u, &g, sizeof(g));
1564
ret = bch2_trans_update(trans, iter, u, 0);
1565
}
1566
out:
1567
fsck_err:
1568
bch2_dev_put(ca);
1569
printbuf_exit(&buf);
1570
return ret;
1571
}
1572
1573
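/*
 * fsck entry point: walk the alloc btree (including holes) and verify it
 * against the need_discard, freespace and bucket_gens btrees, then check
 * those btrees for entries that don't correspond to valid buckets.
 */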
int bch2_check_alloc_info(struct bch_fs *c)
1574
{
1575
struct btree_trans *trans = bch2_trans_get(c);
1576
struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1577
struct bch_dev *ca = NULL;
1578
struct bkey hole;
1579
struct bkey_s_c k;
1580
int ret = 0;
1581
1582
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1583
BTREE_ITER_prefetch);
1584
bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1585
BTREE_ITER_prefetch);
1586
bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1587
BTREE_ITER_prefetch);
1588
bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1589
BTREE_ITER_prefetch);
1590
1591
while (1) {
1592
struct bpos next;
1593
1594
bch2_trans_begin(trans);
1595
1596
k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
1597
ret = bkey_err(k);
1598
if (ret)
1599
goto bkey_err;
1600
1601
if (!k.k)
1602
break;
1603
1604
if (k.k->type) {
1605
next = bpos_nosnap_successor(k.k->p);
1606
1607
ret = bch2_check_alloc_key(trans,
1608
k, &iter,
1609
&discard_iter,
1610
&freespace_iter,
1611
&bucket_gens_iter);
1612
if (ret)
1613
goto bkey_err;
1614
} else {
1615
next = k.k->p;
1616
1617
ret = bch2_check_alloc_hole_freespace(trans, ca,
1618
bkey_start_pos(k.k),
1619
&next,
1620
&freespace_iter) ?:
1621
bch2_check_alloc_hole_bucket_gens(trans,
1622
bkey_start_pos(k.k),
1623
&next,
1624
&bucket_gens_iter);
1625
if (ret)
1626
goto bkey_err;
1627
}
1628
1629
ret = bch2_trans_commit(trans, NULL, NULL,
1630
BCH_TRANS_COMMIT_no_enospc);
1631
if (ret)
1632
goto bkey_err;
1633
1634
bch2_btree_iter_set_pos(trans, &iter, next);
1635
bkey_err:
1636
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1637
continue;
1638
if (ret)
1639
break;
1640
}
1641
bch2_trans_iter_exit(trans, &bucket_gens_iter);
1642
bch2_trans_iter_exit(trans, &freespace_iter);
1643
bch2_trans_iter_exit(trans, &discard_iter);
1644
bch2_trans_iter_exit(trans, &iter);
1645
bch2_dev_put(ca);
1646
ca = NULL;
1647
1648
if (ret < 0)
1649
goto err;
1650
1651
ret = for_each_btree_key(trans, iter,
1652
BTREE_ID_need_discard, POS_MIN,
1653
BTREE_ITER_prefetch, k,
1654
bch2_check_discard_freespace_key_fsck(trans, &iter));
1655
if (ret)
1656
goto err;
1657
1658
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
1659
BTREE_ITER_prefetch);
1660
while (1) {
1661
bch2_trans_begin(trans);
1662
k = bch2_btree_iter_peek(trans, &iter);
1663
if (!k.k)
1664
break;
1665
1666
ret = bkey_err(k) ?:
1667
bch2_check_discard_freespace_key_fsck(trans, &iter);
1668
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1669
ret = 0;
1670
continue;
1671
}
1672
if (ret) {
1673
struct printbuf buf = PRINTBUF;
1674
bch2_bkey_val_to_text(&buf, c, k);
1675
1676
bch_err(c, "while checking %s", buf.buf);
1677
printbuf_exit(&buf);
1678
break;
1679
}
1680
1681
bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
1682
}
1683
bch2_trans_iter_exit(trans, &iter);
1684
if (ret)
1685
goto err;
1686
1687
ret = for_each_btree_key_commit(trans, iter,
1688
BTREE_ID_bucket_gens, POS_MIN,
1689
BTREE_ITER_prefetch, k,
1690
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1691
bch2_check_bucket_gens_key(trans, &iter, k));
1692
err:
1693
bch2_trans_put(trans);
1694
bch_err_fn(c, ret);
1695
return ret;
1696
}
1697
1698
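/*
 * fsck: verify that buckets with a fragmentation index and cached buckets
 * have matching entries in the LRU btrees, fixing up a zero read_time along
 * the way.
 */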
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1699
struct btree_iter *alloc_iter,
1700
struct bkey_buf *last_flushed)
1701
{
1702
struct bch_fs *c = trans->c;
1703
struct bch_alloc_v4 a_convert;
1704
const struct bch_alloc_v4 *a;
1705
struct bkey_s_c alloc_k;
1706
struct printbuf buf = PRINTBUF;
1707
int ret;
1708
1709
alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
1710
if (!alloc_k.k)
1711
return 0;
1712
1713
ret = bkey_err(alloc_k);
1714
if (ret)
1715
return ret;
1716
1717
struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
1718
if (!ca)
1719
return 0;
1720
1721
a = bch2_alloc_to_v4(alloc_k, &a_convert);
1722
1723
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
1724
if (lru_idx) {
1725
ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
1726
bucket_to_u64(alloc_k.k->p),
1727
lru_idx, alloc_k, last_flushed);
1728
if (ret)
1729
goto err;
1730
}
1731
1732
if (a->data_type != BCH_DATA_cached)
1733
goto err;
1734
1735
if (fsck_err_on(!a->io_time[READ],
1736
trans, alloc_key_cached_but_read_time_zero,
1737
"cached bucket with read_time 0\n%s",
1738
(printbuf_reset(&buf),
1739
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1740
struct bkey_i_alloc_v4 *a_mut =
1741
bch2_alloc_to_v4_mut(trans, alloc_k);
1742
ret = PTR_ERR_OR_ZERO(a_mut);
1743
if (ret)
1744
goto err;
1745
1746
a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
1747
ret = bch2_trans_update(trans, alloc_iter,
1748
&a_mut->k_i, BTREE_TRIGGER_norun);
1749
if (ret)
1750
goto err;
1751
1752
a = &a_mut->v;
1753
}
1754
1755
ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
1756
bucket_to_u64(alloc_k.k->p),
1757
a->io_time[READ],
1758
alloc_k, last_flushed);
1759
if (ret)
1760
goto err;
1761
err:
1762
fsck_err:
1763
bch2_dev_put(ca);
1764
printbuf_exit(&buf);
1765
return ret;
1766
}
1767
1768
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1769
{
1770
struct bkey_buf last_flushed;
1771
1772
bch2_bkey_buf_init(&last_flushed);
1773
bkey_init(&last_flushed.k->k);
1774
1775
int ret = bch2_trans_run(c,
1776
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1777
POS_MIN, BTREE_ITER_prefetch, k,
1778
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1779
bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
1780
bch2_check_stripe_to_lru_refs(c);
1781
1782
bch2_bkey_buf_exit(&last_flushed, c);
1783
bch_err_fn(c, ret);
1784
return ret;
1785
}
1786
1787
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
1788
{
1789
struct bch_fs *c = ca->fs;
1790
int ret;
1791
1792
mutex_lock(&ca->discard_buckets_in_flight_lock);
1793
struct discard_in_flight *i =
1794
darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
1795
if (i) {
1796
ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
1797
goto out;
1798
}
1799
1800
ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
1801
.in_progress = in_progress,
1802
.bucket = bucket,
1803
}));
1804
out:
1805
mutex_unlock(&ca->discard_buckets_in_flight_lock);
1806
return ret;
1807
}
1808
1809
static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
1810
{
1811
mutex_lock(&ca->discard_buckets_in_flight_lock);
1812
struct discard_in_flight *i =
1813
darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
1814
BUG_ON(!i || !i->in_progress);
1815
1816
darray_remove_item(&ca->discard_buckets_in_flight, i);
1817
mutex_unlock(&ca->discard_buckets_in_flight_lock);
1818
}
1819
1820
struct discard_buckets_state {
1821
u64 seen;
1822
u64 open;
1823
u64 need_journal_commit;
1824
u64 discarded;
1825
};
1826
1827
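/*
 * Process one bucket from the need_discard btree: once the journal has
 * flushed the bucket's last update, issue the block-layer discard (if
 * enabled) and clear the bucket's need_discard flag.
 */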
static int bch2_discard_one_bucket(struct btree_trans *trans,
1828
struct bch_dev *ca,
1829
struct btree_iter *need_discard_iter,
1830
struct bpos *discard_pos_done,
1831
struct discard_buckets_state *s,
1832
bool fastpath)
1833
{
1834
struct bch_fs *c = trans->c;
1835
struct bpos pos = need_discard_iter->pos;
1836
struct btree_iter iter = {};
1837
struct bkey_s_c k;
1838
struct bkey_i_alloc_v4 *a;
1839
struct printbuf buf = PRINTBUF;
1840
bool discard_locked = false;
1841
int ret = 0;
1842
1843
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1844
s->open++;
1845
goto out;
1846
}
1847
1848
u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
1849
pos.inode, pos.offset);
1850
if (seq_ready > c->journal.flushed_seq_ondisk) {
1851
if (seq_ready > c->journal.flushing_seq)
1852
s->need_journal_commit++;
1853
goto out;
1854
}
1855
1856
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1857
need_discard_iter->pos,
1858
BTREE_ITER_cached);
1859
ret = bkey_err(k);
1860
if (ret)
1861
goto out;
1862
1863
a = bch2_alloc_to_v4_mut(trans, k);
1864
ret = PTR_ERR_OR_ZERO(a);
1865
if (ret)
1866
goto out;
1867
1868
if (a->v.data_type != BCH_DATA_need_discard) {
1869
if (need_discard_or_freespace_err(trans, k, true, true, true)) {
1870
ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
1871
if (ret)
1872
goto out;
1873
goto commit;
1874
}
1875
1876
goto out;
1877
}
1878
1879
if (!fastpath) {
1880
if (discard_in_flight_add(ca, iter.pos.offset, true))
1881
goto out;
1882
1883
discard_locked = true;
1884
}
1885
1886
if (!bkey_eq(*discard_pos_done, iter.pos)) {
1887
s->discarded++;
1888
*discard_pos_done = iter.pos;
1889
1890
if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
1891
/*
1892
* This works without any other locks because this is the only
1893
* thread that removes items from the need_discard tree
1894
*/
1895
bch2_trans_unlock_long(trans);
1896
blkdev_issue_discard(ca->disk_sb.bdev,
1897
k.k->p.offset * ca->mi.bucket_size,
1898
ca->mi.bucket_size,
1899
GFP_KERNEL);
1900
ret = bch2_trans_relock_notrace(trans);
1901
if (ret)
1902
goto out;
1903
}
1904
}
1905
1906
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1907
alloc_data_type_set(&a->v, a->v.data_type);
1908
1909
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1910
if (ret)
1911
goto out;
1912
commit:
1913
ret = bch2_trans_commit(trans, NULL, NULL,
1914
BCH_WATERMARK_btree|
1915
BCH_TRANS_COMMIT_no_enospc);
1916
if (ret)
1917
goto out;
1918
1919
if (!fastpath)
1920
count_event(c, bucket_discard);
1921
else
1922
count_event(c, bucket_discard_fast);
1923
out:
1924
fsck_err:
1925
if (discard_locked)
1926
discard_in_flight_remove(ca, iter.pos.offset);
1927
if (!ret)
1928
s->seen++;
1929
bch2_trans_iter_exit(trans, &iter);
1930
printbuf_exit(&buf);
1931
return ret;
1932
}
1933
1934
static void bch2_do_discards_work(struct work_struct *work)
1935
{
1936
struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
1937
struct bch_fs *c = ca->fs;
1938
struct discard_buckets_state s = {};
1939
struct bpos discard_pos_done = POS_MAX;
1940
int ret;
1941
1942
/*
1943
* We're doing the commit in bch2_discard_one_bucket instead of using
1944
* for_each_btree_key_commit() so that we can increment counters after
1945
* successful commit:
1946
*/
1947
ret = bch2_trans_run(c,
1948
for_each_btree_key_max(trans, iter,
1949
BTREE_ID_need_discard,
1950
POS(ca->dev_idx, 0),
1951
POS(ca->dev_idx, U64_MAX), 0, k,
1952
bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
1953
1954
if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
1955
bch2_journal_flush_async(&c->journal, NULL);
1956
1957
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1958
bch2_err_str(ret));
1959
1960
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
1961
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
1962
}
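/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): the journal-flush heuristic above only forces a flush
 * when more buckets are still waiting on a journal commit than are
 * currently available for allocation, i.e. when pending discards are
 * actually holding allocation back.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool toy_should_flush_journal(uint64_t buckets_waiting_on_journal,
				     uint64_t buckets_available)
{
	return buckets_waiting_on_journal > buckets_available;
}
#endif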
1963
1964
void bch2_dev_do_discards(struct bch_dev *ca)
1965
{
1966
struct bch_fs *c = ca->fs;
1967
1968
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
1969
return;
1970
1971
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
1972
goto put_write_ref;
1973
1974
if (queue_work(c->write_ref_wq, &ca->discard_work))
1975
return;
1976
1977
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
1978
put_write_ref:
1979
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
1980
}
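/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): bch2_dev_do_discards() above takes a filesystem write
 * ref, then a device io ref, then queues the work item; if queueing fails
 * because the work is already pending, the new refs are dropped since the
 * pending work already holds its own, and any earlier failure unwinds in
 * reverse order.
 */
#if 0
#include <stdbool.h>

static bool toy_tryget_fs_write_ref(void)	{ return true; }
static bool toy_tryget_dev_io_ref(void)		{ return true; }
static bool toy_queue_work(void)		{ return true; }
static void toy_put_dev_io_ref(void)		{ }
static void toy_put_fs_write_ref(void)		{ }

static void toy_maybe_queue(void)
{
	if (!toy_tryget_fs_write_ref())
		return;
	if (!toy_tryget_dev_io_ref())
		goto put_write_ref;
	if (toy_queue_work())		/* queued: the work item now owns both refs */
		return;
	toy_put_dev_io_ref();		/* already queued elsewhere: unwind */
put_write_ref:
	toy_put_fs_write_ref();
}
#endif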
1981
1982
void bch2_do_discards(struct bch_fs *c)
1983
{
1984
for_each_member_device(c, ca)
1985
bch2_dev_do_discards(ca);
1986
}
1987
1988
static int bch2_do_discards_fast_one(struct btree_trans *trans,
1989
struct bch_dev *ca,
1990
u64 bucket,
1991
struct bpos *discard_pos_done,
1992
struct discard_buckets_state *s)
1993
{
1994
struct btree_iter need_discard_iter;
1995
struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
1996
BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
1997
int ret = bkey_err(discard_k);
1998
if (ret)
1999
return ret;
2000
2001
if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
2002
trans, discarding_bucket_not_in_need_discard_btree,
2003
"attempting to discard bucket %u:%llu not in need_discard btree",
2004
ca->dev_idx, bucket))
2005
goto out;
2006
2007
ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
2008
out:
2009
fsck_err:
2010
bch2_trans_iter_exit(trans, &need_discard_iter);
2011
return ret;
2012
}
2013
2014
static void bch2_do_discards_fast_work(struct work_struct *work)
2015
{
2016
struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
2017
struct bch_fs *c = ca->fs;
2018
struct discard_buckets_state s = {};
2019
struct bpos discard_pos_done = POS_MAX;
2020
struct btree_trans *trans = bch2_trans_get(c);
2021
int ret = 0;
2022
2023
while (1) {
2024
bool got_bucket = false;
2025
u64 bucket;
2026
2027
mutex_lock(&ca->discard_buckets_in_flight_lock);
2028
darray_for_each(ca->discard_buckets_in_flight, i) {
2029
if (i->in_progress)
2030
continue;
2031
2032
got_bucket = true;
2033
bucket = i->bucket;
2034
i->in_progress = true;
2035
break;
2036
}
2037
mutex_unlock(&ca->discard_buckets_in_flight_lock);
2038
2039
if (!got_bucket)
2040
break;
2041
2042
ret = lockrestart_do(trans,
2043
bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
2044
bch_err_fn(c, ret);
2045
2046
discard_in_flight_remove(ca, bucket);
2047
2048
if (ret)
2049
break;
2050
}
2051
2052
trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
2053
2054
bch2_trans_put(trans);
2055
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
2056
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
2057
}
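/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): the loop above claims work under
 * discard_buckets_in_flight_lock by finding an entry not yet marked
 * in_progress, marking it, and dropping the lock before doing the slow
 * discard; the entry is only removed afterwards.  The claim step in
 * isolation:
 */
#if 0
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_inflight { uint64_t bucket; bool in_progress; };

/* Caller must hold the lock protecting @v. */
static bool toy_claim_next(struct toy_inflight *v, size_t nr, uint64_t *bucket)
{
	for (size_t i = 0; i < nr; i++)
		if (!v[i].in_progress) {
			v[i].in_progress = true;
			*bucket = v[i].bucket;
			return true;
		}
	return false;
}
#endif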
2058
2059
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
2060
{
2061
struct bch_fs *c = ca->fs;
2062
2063
if (discard_in_flight_add(ca, bucket, false))
2064
return;
2065
2066
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
2067
return;
2068
2069
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
2070
goto put_ref;
2071
2072
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
2073
return;
2074
2075
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
2076
put_ref:
2077
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
2078
}
2079
2080
static int invalidate_one_bp(struct btree_trans *trans,
2081
struct bch_dev *ca,
2082
struct bkey_s_c_backpointer bp,
2083
struct bkey_buf *last_flushed)
2084
{
2085
struct btree_iter extent_iter;
2086
struct bkey_s_c extent_k =
2087
bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
2088
int ret = bkey_err(extent_k);
2089
if (ret)
2090
return ret;
2091
2092
if (!extent_k.k)
2093
return 0;
2094
2095
struct bkey_i *n =
2096
bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
2097
BTREE_UPDATE_internal_snapshot_node);
2098
ret = PTR_ERR_OR_ZERO(n);
2099
if (ret)
2100
goto err;
2101
2102
bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
2103
err:
2104
bch2_trans_iter_exit(trans, &extent_iter);
2105
return ret;
2106
}
2107
2108
static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
2109
struct bch_dev *ca,
2110
struct bpos bucket,
2111
u8 gen,
2112
struct bkey_buf *last_flushed)
2113
{
2114
struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
2115
struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
2116
2117
return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
2118
bp_start, bp_end, 0, k,
2119
NULL, NULL,
2120
BCH_WATERMARK_btree|
2121
BCH_TRANS_COMMIT_no_enospc, ({
2122
if (k.k->type != KEY_TYPE_backpointer)
2123
continue;
2124
2125
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
2126
2127
if (bp.v->bucket_gen != gen)
2128
continue;
2129
2130
/* filter out bps with gens that don't match */
2131
2132
invalidate_one_bp(trans, ca, bp, last_flushed);
2133
}));
2134
}
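/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): invalidate_one_bucket_by_bps() above walks every
 * backpointer in one bucket's range of the backpointers btree and only
 * acts on entries whose recorded bucket generation matches the current
 * one; stale backpointers from an earlier generation are skipped.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

struct toy_backpointer { uint8_t bucket_gen; };

static bool toy_bp_is_current(const struct toy_backpointer *bp, uint8_t bucket_gen)
{
	return bp->bucket_gen == bucket_gen;
}
#endif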
2135
2136
noinline_for_stack
2137
static int invalidate_one_bucket(struct btree_trans *trans,
2138
struct bch_dev *ca,
2139
struct btree_iter *lru_iter,
2140
struct bkey_s_c lru_k,
2141
struct bkey_buf *last_flushed,
2142
s64 *nr_to_invalidate)
2143
{
2144
struct bch_fs *c = trans->c;
2145
struct printbuf buf = PRINTBUF;
2146
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
2147
struct btree_iter alloc_iter = {};
2148
int ret = 0;
2149
2150
if (*nr_to_invalidate <= 0)
2151
return 1;
2152
2153
if (!bch2_dev_bucket_exists(c, bucket)) {
2154
if (fsck_err(trans, lru_entry_to_invalid_bucket,
2155
"lru key points to nonexistent device:bucket %llu:%llu",
2156
bucket.inode, bucket.offset))
2157
return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
2158
goto out;
2159
}
2160
2161
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
2162
return 0;
2163
2164
struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
2165
BTREE_ID_alloc, bucket,
2166
BTREE_ITER_cached);
2167
ret = bkey_err(alloc_k);
2168
if (ret)
2169
return ret;
2170
2171
struct bch_alloc_v4 a_convert;
2172
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
2173
2174
/* We expect harmless races here due to the btree write buffer: */
2175
if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
2176
goto out;
2177
2178
/*
2179
* Impossible since alloc_lru_idx_read() only returns nonzero if the
2180
* bucket is supposed to be on the cached bucket LRU (i.e.
2181
* BCH_DATA_cached)
2182
*
2183
* bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
2184
*/
2185
BUG_ON(a->data_type != BCH_DATA_cached);
2186
BUG_ON(a->dirty_sectors);
2187
2188
if (!a->cached_sectors) {
2189
bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
2190
true, last_flushed);
2191
goto out;
2192
}
2193
2194
unsigned cached_sectors = a->cached_sectors;
2195
u8 gen = a->gen;
2196
2197
ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
2198
if (ret)
2199
goto out;
2200
2201
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
2202
--*nr_to_invalidate;
2203
out:
2204
fsck_err:
2205
bch2_trans_iter_exit(trans, &alloc_iter);
2206
printbuf_exit(&buf);
2207
return ret;
2208
}
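/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): the "harmless races" check above compares the time
 * embedded in the LRU key's position with the LRU index the bucket would
 * currently be filed under; because LRU updates go through the btree
 * write buffer, a stale LRU entry can outlive the alloc state it
 * described, and a mismatch just means "skip this entry".
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool toy_lru_entry_is_stale(uint64_t lru_key_time, uint64_t current_lru_idx)
{
	/* the bucket changed since this LRU entry was queued */
	return lru_key_time != current_lru_idx;
}
#endif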
2209
2210
static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
2211
struct bch_dev *ca, bool *wrapped)
2212
{
2213
struct bkey_s_c k;
2214
again:
2215
k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
2216
if (!k.k && !*wrapped) {
2217
bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
2218
*wrapped = true;
2219
goto again;
2220
}
2221
2222
return k;
2223
}
2224
2225
static void bch2_do_invalidates_work(struct work_struct *work)
2226
{
2227
struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
2228
struct bch_fs *c = ca->fs;
2229
struct btree_trans *trans = bch2_trans_get(c);
2230
int ret = 0;
2231
2232
struct bkey_buf last_flushed;
2233
bch2_bkey_buf_init(&last_flushed);
2234
bkey_init(&last_flushed.k->k);
2235
2236
ret = bch2_btree_write_buffer_tryflush(trans);
2237
if (ret)
2238
goto err;
2239
2240
s64 nr_to_invalidate =
2241
should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
2242
struct btree_iter iter;
2243
bool wrapped = false;
2244
2245
bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
2246
lru_pos(ca->dev_idx, 0,
2247
((bch2_current_io_time(c, READ) + U32_MAX) &
2248
LRU_TIME_MAX)), 0);
2249
2250
while (true) {
2251
bch2_trans_begin(trans);
2252
2253
struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
2254
ret = bkey_err(k);
2255
if (ret)
2256
goto restart_err;
2257
if (!k.k)
2258
break;
2259
2260
ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
2261
restart_err:
2262
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2263
continue;
2264
if (ret)
2265
break;
2266
2267
bch2_btree_iter_advance(trans, &iter);
2268
}
2269
bch2_trans_iter_exit(trans, &iter);
2270
err:
2271
bch2_trans_put(trans);
2272
bch2_bkey_buf_exit(&last_flushed, c);
2273
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
2274
enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
2275
}
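/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): the invalidate worker above starts its LRU scan at a
 * position derived from the current read clock and, when it runs off the
 * end of the device's LRU keyspace, wraps around to position 0 exactly
 * once (next_lru_key()).  The same "scan with a single wrap" control flow
 * over a plain array:
 */
#if 0
#include <stdbool.h>
#include <stddef.h>

static void toy_scan_wrapped(const int *v, size_t nr, size_t start,
			     void (*visit)(int))
{
	bool wrapped = false;
	size_t i = start;

	while (i < nr || !wrapped) {
		if (i >= nr) {			/* hit the end: wrap once */
			i = 0;
			wrapped = true;
			continue;
		}
		if (wrapped && i >= start)	/* back where we began */
			break;
		visit(v[i++]);
	}
}
#endif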
2276
2277
void bch2_dev_do_invalidates(struct bch_dev *ca)
2278
{
2279
struct bch_fs *c = ca->fs;
2280
2281
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
2282
return;
2283
2284
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
2285
goto put_ref;
2286
2287
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
2288
return;
2289
2290
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
2291
put_ref:
2292
enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
2293
}
2294
2295
void bch2_do_invalidates(struct bch_fs *c)
2296
{
2297
for_each_member_device(c, ca)
2298
bch2_dev_do_invalidates(ca);
2299
}
2300
2301
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
2302
u64 bucket_start, u64 bucket_end)
2303
{
2304
struct btree_trans *trans = bch2_trans_get(c);
2305
struct btree_iter iter;
2306
struct bkey_s_c k;
2307
struct bkey hole;
2308
struct bpos end = POS(ca->dev_idx, bucket_end);
2309
struct bch_member *m;
2310
unsigned long last_updated = jiffies;
2311
int ret;
2312
2313
BUG_ON(bucket_start > bucket_end);
2314
BUG_ON(bucket_end > ca->mi.nbuckets);
2315
2316
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2317
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
2318
BTREE_ITER_prefetch);
2319
/*
2320
* Scan the alloc btree for every bucket on @ca, and add buckets to the
2321
* freespace/need_discard/need_gc_gens btrees as needed:
2322
*/
2323
while (1) {
2324
if (time_after(jiffies, last_updated + HZ * 10)) {
2325
bch_info(ca, "%s: currently at %llu/%llu",
2326
__func__, iter.pos.offset, ca->mi.nbuckets);
2327
last_updated = jiffies;
2328
}
2329
2330
bch2_trans_begin(trans);
2331
2332
if (bkey_ge(iter.pos, end)) {
2333
ret = 0;
2334
break;
2335
}
2336
2337
k = bch2_get_key_or_hole(trans, &iter, end, &hole);
2338
ret = bkey_err(k);
2339
if (ret)
2340
goto bkey_err;
2341
2342
if (k.k->type) {
2343
/*
2344
* We process live keys in the alloc btree one at a
2345
* time:
2346
*/
2347
struct bch_alloc_v4 a_convert;
2348
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
2349
2350
ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
2351
bch2_trans_commit(trans, NULL, NULL,
2352
BCH_TRANS_COMMIT_no_enospc);
2353
if (ret)
2354
goto bkey_err;
2355
2356
bch2_btree_iter_advance(trans, &iter);
2357
} else {
2358
struct bkey_i *freespace;
2359
2360
freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
2361
ret = PTR_ERR_OR_ZERO(freespace);
2362
if (ret)
2363
goto bkey_err;
2364
2365
bkey_init(&freespace->k);
2366
freespace->k.type = KEY_TYPE_set;
2367
freespace->k.p = k.k->p;
2368
freespace->k.size = k.k->size;
2369
2370
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
2371
bch2_trans_commit(trans, NULL, NULL,
2372
BCH_TRANS_COMMIT_no_enospc);
2373
if (ret)
2374
goto bkey_err;
2375
2376
bch2_btree_iter_set_pos(trans, &iter, k.k->p);
2377
}
2378
bkey_err:
2379
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2380
continue;
2381
if (ret)
2382
break;
2383
}
2384
2385
bch2_trans_iter_exit(trans, &iter);
2386
bch2_trans_put(trans);
2387
2388
if (ret < 0) {
2389
bch_err_msg(ca, ret, "initializing free space");
2390
return ret;
2391
}
2392
2393
mutex_lock(&c->sb_lock);
2394
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2395
SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
2396
mutex_unlock(&c->sb_lock);
2397
2398
return 0;
2399
}
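/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): in the loop above, bch2_get_key_or_hole() returns either
 * a live alloc key (k.k->type != 0), which is indexed one bucket at a
 * time, or a synthetic "hole" key covering a run of missing alloc keys,
 * which becomes a single KEY_TYPE_set extent in the freespace btree with
 * the hole's end position and size copied over.
 */
#if 0
#include <stdint.h>

#define TOY_KEY_TYPE_SET 1

struct toy_key {
	uint8_t  type;
	uint64_t pos;	/* end position of the extent */
	uint32_t size;	/* length in buckets */
};

static struct toy_key toy_freespace_extent_for_hole(const struct toy_key *hole)
{
	return (struct toy_key) {
		.type = TOY_KEY_TYPE_SET,
		.pos  = hole->pos,	/* same end position as the hole */
		.size = hole->size,	/* covers the whole run of empty buckets */
	};
}
#endif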
2400
2401
int bch2_fs_freespace_init(struct bch_fs *c)
2402
{
2403
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
2404
return 0;
2405
2406
2407
/*
2408
* We can crash during the device add path, so we need to check this on
2409
* every mount:
2410
*/
2411
2412
bool doing_init = false;
2413
for_each_member_device(c, ca) {
2414
if (ca->mi.freespace_initialized)
2415
continue;
2416
2417
if (!doing_init) {
2418
bch_info(c, "initializing freespace");
2419
doing_init = true;
2420
}
2421
2422
int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
2423
if (ret) {
2424
bch2_dev_put(ca);
2425
bch_err_fn(c, ret);
2426
return ret;
2427
}
2428
}
2429
2430
if (doing_init) {
2431
mutex_lock(&c->sb_lock);
2432
bch2_write_super(c);
2433
mutex_unlock(&c->sb_lock);
2434
bch_verbose(c, "done initializing freespace");
2435
}
2436
2437
return 0;
2438
}
2439
2440
/* device removal */
2441
2442
int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
2443
{
2444
struct bpos start = POS(ca->dev_idx, 0);
2445
struct bpos end = POS(ca->dev_idx, U64_MAX);
2446
int ret;
2447
2448
/*
2449
* We clear the LRU and need_discard btrees first so that we don't race
2450
* with bch2_do_invalidates() and bch2_do_discards()
2451
*/
2452
ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
2453
BTREE_TRIGGER_norun, NULL) ?:
2454
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
2455
BTREE_TRIGGER_norun, NULL) ?:
2456
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
2457
BTREE_TRIGGER_norun, NULL) ?:
2458
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
2459
BTREE_TRIGGER_norun, NULL) ?:
2460
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
2461
BTREE_TRIGGER_norun, NULL) ?:
2462
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
2463
BTREE_TRIGGER_norun, NULL) ?:
2464
bch2_dev_usage_remove(c, ca->dev_idx);
2465
bch_err_msg(ca, ret, "removing dev alloc info");
2466
return ret;
2467
}
2468
2469
/* Bucket IO clocks: */
2470
2471
static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2472
size_t bucket_nr, int rw)
2473
{
2474
struct bch_fs *c = trans->c;
2475
2476
struct btree_iter iter;
2477
struct bkey_i_alloc_v4 *a =
2478
bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
2479
int ret = PTR_ERR_OR_ZERO(a);
2480
if (ret)
2481
return ret;
2482
2483
u64 now = bch2_current_io_time(c, rw);
2484
if (a->v.io_time[rw] == now)
2485
goto out;
2486
2487
a->v.io_time[rw] = now;
2488
2489
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2490
bch2_trans_commit(trans, NULL, NULL, 0);
2491
out:
2492
bch2_trans_iter_exit(trans, &iter);
2493
return ret;
2494
}
2495
2496
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2497
size_t bucket_nr, int rw)
2498
{
2499
if (bch2_trans_relock(trans))
2500
bch2_trans_begin(trans);
2501
2502
return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
2503
}
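/*
 * Illustrative sketch (simplified and standalone; toy_* names are
 * hypothetical): io_time[READ] / io_time[WRITE] record the filesystem IO
 * clock at the last read or write to a bucket; __bch2_bucket_io_time_reset()
 * above skips the update when the recorded value already matches the
 * current clock, so repeated IO within one clock tick never rewrites the
 * alloc key.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool toy_io_time_needs_update(uint64_t recorded, uint64_t now)
{
	return recorded != now;	/* only touch the alloc key if the clock moved */
}
#endif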
2504
2505
/* Startup/shutdown (ro/rw): */
2506
2507
void bch2_recalc_capacity(struct bch_fs *c)
2508
{
2509
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2510
unsigned bucket_size_max = 0;
2511
unsigned long ra_pages = 0;
2512
2513
lockdep_assert_held(&c->state_lock);
2514
2515
guard(rcu)();
2516
for_each_member_device_rcu(c, ca, NULL) {
2517
struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
2518
if (bdev)
2519
ra_pages += bdev->bd_disk->bdi->ra_pages;
2520
2521
if (ca->mi.state != BCH_MEMBER_STATE_rw)
2522
continue;
2523
2524
u64 dev_reserve = 0;
2525
2526
/*
2527
* We need to reserve buckets (from the number
2528
* of currently available buckets) against
2529
* foreground writes so that mainly copygc can
2530
* make forward progress.
2531
*
2532
* We need enough to refill the various reserves
2533
* from scratch - copygc will use its entire
2534
* reserve all at once, then run against when
2535
* its reserve is refilled (from the formerly
2536
* available buckets).
2537
*
2538
* This reserve is just used when considering if
2539
* allocations for foreground writes must wait -
2540
* not -ENOSPC calculations.
2541
*/
2542
2543
dev_reserve += ca->nr_btree_reserve * 2;
2544
dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2545
2546
dev_reserve += 1; /* btree write point */
2547
dev_reserve += 1; /* copygc write point */
2548
dev_reserve += 1; /* rebalance write point */
2549
2550
dev_reserve *= ca->mi.bucket_size;
2551
2552
capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2553
ca->mi.first_bucket);
2554
2555
reserved_sectors += dev_reserve * 2;
2556
2557
bucket_size_max = max_t(unsigned, bucket_size_max,
2558
ca->mi.bucket_size);
2559
}
2560
2561
bch2_set_ra_pages(c, ra_pages);
2562
2563
gc_reserve = c->opts.gc_reserve_bytes
2564
? c->opts.gc_reserve_bytes >> 9
2565
: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2566
2567
reserved_sectors = max(gc_reserve, reserved_sectors);
2568
2569
reserved_sectors = min(reserved_sectors, capacity);
2570
2571
c->reserved = reserved_sectors;
2572
c->capacity = capacity - reserved_sectors;
2573
2574
c->bucket_size_max = bucket_size_max;
2575
2576
/* Wake up in case someone was waiting for buckets */
2577
closure_wake_up(&c->freelist_wait);
2578
}
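/*
 * Illustrative worked example (made-up numbers; toy_* names are
 * hypothetical, not bcachefs API): with nr_btree_reserve = 4,
 * nbuckets = 1 << 20 and 2048-sector buckets, the per-device reserve
 * above is (4 * 2) + (1048576 >> 6) + 3 = 16395 buckets = 33,576,960
 * sectors, and it is counted twice into reserved_sectors.  gc_reserve
 * comes either from gc_reserve_bytes (converted to sectors with >> 9) or
 * from gc_reserve_percent of the raw capacity; reserved_sectors then
 * becomes the larger of that and the per-device total, capped at the
 * capacity itself.
 */
#if 0
#include <stdint.h>

static uint64_t toy_gc_reserve(uint64_t capacity_sectors,
			       uint64_t gc_reserve_bytes,
			       unsigned gc_reserve_percent)
{
	return gc_reserve_bytes
		? gc_reserve_bytes >> 9			/* bytes -> 512-byte sectors */
		: capacity_sectors * gc_reserve_percent / 100;
}
#endif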
2579
2580
u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2581
{
2582
u64 ret = U64_MAX;
2583
2584
guard(rcu)();
2585
for_each_rw_member_rcu(c, ca)
2586
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2587
return ret;
2588
}
2589
2590
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2591
{
2592
struct open_bucket *ob;
2593
2594
for (ob = c->open_buckets;
2595
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2596
ob++) {
2597
scoped_guard(spinlock, &ob->lock) {
2598
if (ob->valid && !ob->on_partial_list &&
2599
ob->dev == ca->dev_idx)
2600
return true;
2601
}
2602
}
2603
2604
return false;
2605
}
2606
2607
void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
2608
{
2609
/* BCH_DATA_free == all rw devs */
2610
2611
for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2612
if (rw &&
2613
(i == BCH_DATA_free ||
2614
(ca->mi.data_allowed & BIT(i))))
2615
set_bit(ca->dev_idx, c->rw_devs[i].d);
2616
else
2617
clear_bit(ca->dev_idx, c->rw_devs[i].d);
2618
}
2619
2620
/* device goes ro: */
2621
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2622
{
2623
lockdep_assert_held(&c->state_lock);
2624
2625
/* First, remove device from allocation groups: */
2626
bch2_dev_allocator_set_rw(c, ca, false);
2627
2628
c->rw_devs_change_count++;
2629
2630
/*
2631
* Capacity is calculated based off of devices in allocation groups:
2632
*/
2633
bch2_recalc_capacity(c);
2634
2635
bch2_open_buckets_stop(c, ca, false);
2636
2637
/*
2638
* Wake up threads that were blocked on allocation, so they can notice
2639
* the device can no longer be removed and the capacity has changed:
2640
*/
2641
closure_wake_up(&c->freelist_wait);
2642
2643
/*
2644
* journal_res_get() can block waiting for free space in the journal -
2645
* it needs to notice there may not be devices to allocate from anymore:
2646
*/
2647
wake_up(&c->journal.wait);
2648
2649
/* Now wait for any in flight writes: */
2650
2651
closure_wait_event(&c->open_buckets_wait,
2652
!bch2_dev_has_open_write_point(c, ca));
2653
}
2654
2655
/* device goes rw: */
2656
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2657
{
2658
lockdep_assert_held(&c->state_lock);
2659
2660
bch2_dev_allocator_set_rw(c, ca, true);
2661
c->rw_devs_change_count++;
2662
}
2663
2664
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
2665
{
2666
darray_exit(&ca->discard_buckets_in_flight);
2667
}
2668
2669
void bch2_dev_allocator_background_init(struct bch_dev *ca)
2670
{
2671
mutex_init(&ca->discard_buckets_in_flight_lock);
2672
INIT_WORK(&ca->discard_work, bch2_do_discards_work);
2673
INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
2674
INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
2675
}
2676
2677
void bch2_fs_allocator_background_init(struct bch_fs *c)
2678
{
2679
spin_lock_init(&c->freelist_lock);
2680
}
2681
2682