GitHub Repository: torvalds/linux
Path: blob/master/fs/bcachefs/alloc_foreground.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright 2012 Google, Inc.
4
*
5
* Foreground allocator code: allocate buckets from freelist, and allocate in
6
* sector granularity from writepoints.
7
*
8
* bch2_bucket_alloc() allocates a single bucket from a specific device.
9
*
10
* bch2_bucket_alloc_set() allocates one or more buckets from different devices
11
* in a given filesystem.
12
*/
13
14
#include "bcachefs.h"
15
#include "alloc_background.h"
16
#include "alloc_foreground.h"
17
#include "backpointers.h"
18
#include "btree_iter.h"
19
#include "btree_update.h"
20
#include "btree_gc.h"
21
#include "buckets.h"
22
#include "buckets_waiting_for_journal.h"
23
#include "clock.h"
24
#include "debug.h"
25
#include "disk_groups.h"
26
#include "ec.h"
27
#include "error.h"
28
#include "io_write.h"
29
#include "journal.h"
30
#include "movinggc.h"
31
#include "nocow_locking.h"
32
#include "trace.h"
33
34
#include <linux/math64.h>
35
#include <linux/rculist.h>
36
#include <linux/rcupdate.h>
37
38
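/*
 * Take @lock without blocking while btree node locks are held: if the
 * trylock fails, drop the transaction's btree locks before sleeping on the
 * mutex, so we can't deadlock against a thread that holds @lock while
 * waiting for btree locks. Note that the transaction is left unlocked
 * (hence "norelock") - callers must be prepared to relock or restart.
 */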
static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39
struct mutex *lock)
40
{
41
if (!mutex_trylock(lock)) {
42
bch2_trans_unlock(trans);
43
mutex_lock(lock);
44
}
45
}
46
47
const char * const bch2_watermarks[] = {
48
#define x(t) #t,
49
BCH_WATERMARKS()
50
#undef x
51
NULL
52
};
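/*
 * The x-macro above turns each watermark name into a string. For example, if
 * BCH_WATERMARKS() expanded to x(normal) x(copygc) x(btree) (an illustrative
 * subset, not the real list), the array would become
 * { "normal", "copygc", "btree", NULL }, indexed by enum bch_watermark.
 */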
53
54
/*
55
* Open buckets represent a bucket that's currently being allocated from. They
56
* serve two purposes:
57
*
58
* - They track buckets that have been partially allocated, allowing for
59
* sub-bucket sized allocations - they're used by the sector allocator below
60
*
61
* - They provide a reference to the buckets they own that mark and sweep GC
62
* can find, until the new allocation has a pointer to it inserted into the
63
* btree
64
*
65
* When allocating some space with the sector allocator, the allocation comes
66
* with a reference to an open bucket - the caller is required to put that
67
* reference _after_ doing the index update that makes its allocation reachable.
68
*/
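/*
 * In other words, the rough lifecycle is:
 *
 *	ob = <allocate space, taking a ref on the open_bucket>
 *	<write the data>
 *	<btree update inserting a pointer to the new allocation>
 *	bch2_open_bucket_put(c, ob);
 *
 * Dropping the reference before the index update would leave a window where
 * neither the btree nor an open_bucket references the new allocation.
 */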
69
70
void bch2_reset_alloc_cursors(struct bch_fs *c)
71
{
72
guard(rcu)();
73
for_each_member_device_rcu(c, ca, NULL)
74
memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
75
}
76
77
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
78
{
79
open_bucket_idx_t idx = ob - c->open_buckets;
80
open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
81
82
ob->hash = *slot;
83
*slot = idx;
84
}
85
86
static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
87
{
88
open_bucket_idx_t idx = ob - c->open_buckets;
89
open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
90
91
while (*slot != idx) {
92
BUG_ON(!*slot);
93
slot = &c->open_buckets[*slot].hash;
94
}
95
96
*slot = ob->hash;
97
ob->hash = 0;
98
}
99
100
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
101
{
102
struct bch_dev *ca = ob_dev(c, ob);
103
104
if (ob->ec) {
105
ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
106
return;
107
}
108
109
spin_lock(&ob->lock);
110
ob->valid = false;
111
ob->data_type = 0;
112
spin_unlock(&ob->lock);
113
114
spin_lock(&c->freelist_lock);
115
bch2_open_bucket_hash_remove(c, ob);
116
117
ob->freelist = c->open_buckets_freelist;
118
c->open_buckets_freelist = ob - c->open_buckets;
119
120
c->open_buckets_nr_free++;
121
ca->nr_open_buckets--;
122
spin_unlock(&c->freelist_lock);
123
124
closure_wake_up(&c->open_buckets_wait);
125
}
126
127
void bch2_open_bucket_write_error(struct bch_fs *c,
128
struct open_buckets *obs,
129
unsigned dev, int err)
130
{
131
struct open_bucket *ob;
132
unsigned i;
133
134
open_bucket_for_each(c, obs, ob, i)
135
if (ob->dev == dev && ob->ec)
136
bch2_ec_bucket_cancel(c, ob, err);
137
}
138
139
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
140
{
141
struct open_bucket *ob;
142
143
BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
144
145
ob = c->open_buckets + c->open_buckets_freelist;
146
c->open_buckets_freelist = ob->freelist;
147
atomic_set(&ob->pin, 1);
148
ob->data_type = 0;
149
150
c->open_buckets_nr_free--;
151
return ob;
152
}
153
154
static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
155
{
156
if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
157
return false;
158
159
return bch2_is_superblock_bucket(ca, b);
160
}
161
162
static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
163
{
164
BUG_ON(c->open_buckets_partial_nr >=
165
ARRAY_SIZE(c->open_buckets_partial));
166
167
spin_lock(&c->freelist_lock);
168
scoped_guard(rcu)
169
bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
170
171
ob->on_partial_list = true;
172
c->open_buckets_partial[c->open_buckets_partial_nr++] =
173
ob - c->open_buckets;
174
spin_unlock(&c->freelist_lock);
175
176
closure_wake_up(&c->open_buckets_wait);
177
closure_wake_up(&c->freelist_wait);
178
}
179
180
static inline bool may_alloc_bucket(struct bch_fs *c,
181
struct alloc_request *req,
182
struct bpos bucket)
183
{
184
if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
185
req->counters.skipped_open++;
186
return false;
187
}
188
189
u64 journal_seq_ready =
190
bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
191
bucket.inode, bucket.offset);
192
if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
193
if (journal_seq_ready > c->journal.flushing_seq)
194
req->counters.need_journal_commit++;
195
req->counters.skipped_need_journal_commit++;
196
return false;
197
}
198
199
if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
200
req->counters.skipped_nocow++;
201
return false;
202
}
203
204
return true;
205
}
206
207
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
208
struct alloc_request *req,
209
u64 bucket, u8 gen,
210
struct closure *cl)
211
{
212
struct bch_dev *ca = req->ca;
213
214
if (unlikely(is_superblock_bucket(c, ca, bucket)))
215
return NULL;
216
217
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
218
req->counters.skipped_nouse++;
219
return NULL;
220
}
221
222
spin_lock(&c->freelist_lock);
223
224
if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
225
if (cl)
226
closure_wait(&c->open_buckets_wait, cl);
227
228
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
229
spin_unlock(&c->freelist_lock);
230
return ERR_PTR(bch_err_throw(c, open_buckets_empty));
231
}
232
233
/* Recheck under lock: */
234
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
235
spin_unlock(&c->freelist_lock);
236
req->counters.skipped_open++;
237
return NULL;
238
}
239
240
struct open_bucket *ob = bch2_open_bucket_alloc(c);
241
242
spin_lock(&ob->lock);
243
ob->valid = true;
244
ob->sectors_free = ca->mi.bucket_size;
245
ob->dev = ca->dev_idx;
246
ob->gen = gen;
247
ob->bucket = bucket;
248
spin_unlock(&ob->lock);
249
250
ca->nr_open_buckets++;
251
bch2_open_bucket_hash_add(c, ob);
252
253
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
254
track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
255
256
spin_unlock(&c->freelist_lock);
257
return ob;
258
}
259
260
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
261
struct alloc_request *req,
262
struct btree_iter *freespace_iter,
263
struct closure *cl)
264
{
265
struct bch_fs *c = trans->c;
266
u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
267
268
if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
269
return NULL;
270
271
u8 gen;
272
int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
273
if (ret < 0)
274
return ERR_PTR(ret);
275
if (ret)
276
return NULL;
277
278
return __try_alloc_bucket(c, req, b, gen, cl);
279
}
280
281
/*
282
 * This path is used before the freespace btree is initialized:
283
*/
284
static noinline struct open_bucket *
285
bch2_bucket_alloc_early(struct btree_trans *trans,
286
struct alloc_request *req,
287
struct closure *cl)
288
{
289
struct bch_fs *c = trans->c;
290
struct bch_dev *ca = req->ca;
291
struct btree_iter iter, citer;
292
struct bkey_s_c k, ck;
293
struct open_bucket *ob = NULL;
294
u64 first_bucket = ca->mi.first_bucket;
295
u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
296
u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
297
u64 alloc_cursor = alloc_start;
298
int ret;
299
300
/*
301
* Scan with an uncached iterator to avoid polluting the key cache. An
302
* uncached iter will return a cached key if one exists, but if not
303
* there is no other underlying protection for the associated key cache
304
* slot. To avoid racing bucket allocations, look up the cached key slot
305
* of any likely allocation candidate before attempting to proceed with
306
* the allocation. This provides proper exclusion on the associated
307
* bucket.
308
*/
309
again:
310
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
311
BTREE_ITER_slots, k, ret) {
312
u64 bucket = k.k->p.offset;
313
314
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
315
break;
316
317
if (req->btree_bitmap != BTREE_BITMAP_ANY &&
318
req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
319
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
320
if (req->btree_bitmap == BTREE_BITMAP_YES &&
321
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
322
break;
323
324
bucket = sector_to_bucket(ca,
325
round_up(bucket_to_sector(ca, bucket) + 1,
326
1ULL << ca->mi.btree_bitmap_shift));
327
bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
328
req->counters.buckets_seen++;
329
req->counters.skipped_mi_btree_bitmap++;
330
continue;
331
}
332
333
struct bch_alloc_v4 a_convert;
334
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
335
if (a->data_type != BCH_DATA_free)
336
continue;
337
338
/* now check the cached key to serialize concurrent allocs of the bucket */
339
ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
340
ret = bkey_err(ck);
341
if (ret)
342
break;
343
344
a = bch2_alloc_to_v4(ck, &a_convert);
345
if (a->data_type != BCH_DATA_free)
346
goto next;
347
348
req->counters.buckets_seen++;
349
350
ob = may_alloc_bucket(c, req, k.k->p)
351
? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
352
: NULL;
353
next:
354
bch2_set_btree_iter_dontneed(trans, &citer);
355
bch2_trans_iter_exit(trans, &citer);
356
if (ob)
357
break;
358
}
359
bch2_trans_iter_exit(trans, &iter);
360
361
alloc_cursor = iter.pos.offset;
362
363
if (!ob && ret)
364
ob = ERR_PTR(ret);
365
366
if (!ob && alloc_start > first_bucket) {
367
alloc_cursor = alloc_start = first_bucket;
368
goto again;
369
}
370
371
*dev_alloc_cursor = alloc_cursor;
372
373
return ob;
374
}
375
376
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
377
struct alloc_request *req,
378
struct closure *cl)
379
{
380
struct bch_dev *ca = req->ca;
381
struct btree_iter iter;
382
struct bkey_s_c k;
383
struct open_bucket *ob = NULL;
384
u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
385
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
386
u64 alloc_cursor = alloc_start;
387
int ret;
388
again:
389
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
390
POS(ca->dev_idx, alloc_cursor),
391
POS(ca->dev_idx, U64_MAX),
392
0, k, ret) {
393
/*
394
 * peek normally doesn't trim extents - they can span iter.pos,
395
* which is not what we want here:
396
*/
397
iter.k.size = iter.k.p.offset - iter.pos.offset;
398
399
while (iter.k.size) {
400
req->counters.buckets_seen++;
401
402
u64 bucket = iter.pos.offset & ~(~0ULL << 56);
403
if (req->btree_bitmap != BTREE_BITMAP_ANY &&
404
req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
405
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
406
if (req->btree_bitmap == BTREE_BITMAP_YES &&
407
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
408
goto fail;
409
410
bucket = sector_to_bucket(ca,
411
round_up(bucket_to_sector(ca, bucket + 1),
412
1ULL << ca->mi.btree_bitmap_shift));
413
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
414
415
bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
416
req->counters.skipped_mi_btree_bitmap++;
417
goto next;
418
}
419
420
ob = try_alloc_bucket(trans, req, &iter, cl);
421
if (ob) {
422
if (!IS_ERR(ob))
423
*dev_alloc_cursor = iter.pos.offset;
424
bch2_set_btree_iter_dontneed(trans, &iter);
425
break;
426
}
427
428
iter.k.size--;
429
iter.pos.offset++;
430
}
431
next:
432
if (ob || ret)
433
break;
434
}
435
fail:
436
bch2_trans_iter_exit(trans, &iter);
437
438
BUG_ON(ob && ret);
439
440
if (ret)
441
ob = ERR_PTR(ret);
442
443
if (!ob && alloc_start > ca->mi.first_bucket) {
444
alloc_cursor = alloc_start = ca->mi.first_bucket;
445
goto again;
446
}
447
448
return ob;
449
}
450
451
static noinline void trace_bucket_alloc2(struct bch_fs *c,
452
struct alloc_request *req,
453
struct closure *cl,
454
struct open_bucket *ob)
455
{
456
struct printbuf buf = PRINTBUF;
457
458
printbuf_tabstop_push(&buf, 24);
459
460
prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
461
prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
462
prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
463
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
464
prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
465
prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
466
prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
467
bch2_copygc_wait_amount(c),
468
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
469
prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
470
prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
471
prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
472
prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
473
prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
474
prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
475
476
if (!IS_ERR(ob)) {
477
prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
478
trace_bucket_alloc(c, buf.buf);
479
} else {
480
prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
481
trace_bucket_alloc_fail(c, buf.buf);
482
}
483
484
printbuf_exit(&buf);
485
}
486
487
/**
488
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
489
* @trans: transaction object
490
* @req: state for the entire allocation
491
* @cl: if not NULL, closure to be used to wait if buckets not available
492
* @nowait: if true, do not wait for buckets to become available
493
*
494
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
495
*/
496
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
497
struct alloc_request *req,
498
struct closure *cl,
499
bool nowait)
500
{
501
struct bch_fs *c = trans->c;
502
struct bch_dev *ca = req->ca;
503
struct open_bucket *ob = NULL;
504
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
505
u64 avail;
506
bool waiting = nowait;
507
508
req->btree_bitmap = req->data_type == BCH_DATA_btree;
509
memset(&req->counters, 0, sizeof(req->counters));
510
again:
511
bch2_dev_usage_read_fast(ca, &req->usage);
512
avail = dev_buckets_free(ca, req->usage, req->watermark);
513
514
if (req->usage.buckets[BCH_DATA_need_discard] >
515
min(avail, ca->mi.nbuckets >> 7))
516
bch2_dev_do_discards(ca);
517
518
if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
519
bch2_gc_gens_async(c);
520
521
if (should_invalidate_buckets(ca, req->usage))
522
bch2_dev_do_invalidates(ca);
523
524
if (!avail) {
525
if (req->watermark > BCH_WATERMARK_normal &&
526
c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
527
goto alloc;
528
529
if (cl && !waiting) {
530
closure_wait(&c->freelist_wait, cl);
531
waiting = true;
532
goto again;
533
}
534
535
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
536
537
ob = ERR_PTR(bch_err_throw(c, freelist_empty));
538
goto err;
539
}
540
541
if (waiting)
542
closure_wake_up(&c->freelist_wait);
543
alloc:
544
ob = likely(freespace)
545
? bch2_bucket_alloc_freelist(trans, req, cl)
546
: bch2_bucket_alloc_early(trans, req, cl);
547
548
if (req->counters.need_journal_commit * 2 > avail)
549
bch2_journal_flush_async(&c->journal, NULL);
550
551
if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
552
req->btree_bitmap = BTREE_BITMAP_ANY;
553
goto alloc;
554
}
555
556
if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
557
freespace = false;
558
goto alloc;
559
}
560
err:
561
if (!ob)
562
ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
563
564
if (!IS_ERR(ob))
565
ob->data_type = req->data_type;
566
567
if (!IS_ERR(ob))
568
count_event(c, bucket_alloc);
569
else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
570
count_event(c, bucket_alloc_fail);
571
572
if (!IS_ERR(ob)
573
? trace_bucket_alloc_enabled()
574
: trace_bucket_alloc_fail_enabled())
575
trace_bucket_alloc2(c, req, cl, ob);
576
577
return ob;
578
}
579
580
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
581
enum bch_watermark watermark,
582
enum bch_data_type data_type,
583
struct closure *cl)
584
{
585
struct open_bucket *ob;
586
struct alloc_request req = {
587
.watermark = watermark,
588
.data_type = data_type,
589
.ca = ca,
590
};
591
592
bch2_trans_do(c,
593
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
594
return ob;
595
}
596
597
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
598
unsigned l, unsigned r)
599
{
600
return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
601
}
602
603
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
604
605
void bch2_dev_alloc_list(struct bch_fs *c,
606
struct dev_stripe_state *stripe,
607
struct bch_devs_mask *devs,
608
struct dev_alloc_list *ret)
609
{
610
ret->nr = 0;
611
612
unsigned i;
613
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
614
ret->data[ret->nr++] = i;
615
616
bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
617
}
618
619
static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
620
static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
621
static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
622
623
static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
624
{
625
/*
626
* Avoid underflowing clock hands if at all possible, if clock hands go
627
* to 0 then we lose information - clock hands can be in a wide range if
628
 * we have devices we rarely try to allocate from - e.g. if we generally
629
* allocate from a specified target but only sometimes have to fall back
630
* to the whole filesystem.
631
*/
632
u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
633
	u64 scale_min = 0; /* minimum we must subtract to avoid overflow */
634
635
for (u64 *v = stripe->next_alloc;
636
v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
637
if (*v)
638
scale_max = min(scale_max, *v);
639
if (*v > stripe_clock_hand_max)
640
scale_min = max(scale_min, *v - stripe_clock_hand_max);
641
}
642
643
u64 scale = max(scale_min, scale_max);
644
645
for (u64 *v = stripe->next_alloc;
646
v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
647
*v = *v < scale ? 0 : *v - scale;
648
}
649
650
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
651
struct dev_stripe_state *stripe,
652
struct bch_dev_usage *usage)
653
{
654
/*
655
* Stripe state has a per device clock hand: we allocate from the device
656
* with the smallest clock hand.
657
*
658
* When we allocate, we don't do a simple increment; we add the inverse
659
* of the device's free space. This results in round robin behavior that
660
* biases in favor of the device(s) with more free space.
661
*/
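	/*
	 * E.g., with illustrative numbers: a device with 1000 free buckets has
	 * its hand bumped by inv/1000 per allocation, while one with 100 free
	 * buckets gets inv/100 - the emptier device's hand advances ~10x
	 * faster, so it's chosen correspondingly less often.
	 */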
662
663
u64 *v = stripe->next_alloc + ca->dev_idx;
664
u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
665
u64 free_space_inv = free_space
666
? div64_u64(stripe_clock_hand_inv, free_space)
667
: stripe_clock_hand_inv;
668
669
/* Saturating add, avoid overflow: */
670
u64 sum = *v + free_space_inv;
671
*v = sum >= *v ? sum : U64_MAX;
672
673
if (unlikely(*v > stripe_clock_hand_rescale))
674
bch2_stripe_state_rescale(stripe);
675
}
676
677
void bch2_dev_stripe_increment(struct bch_dev *ca,
678
struct dev_stripe_state *stripe)
679
{
680
struct bch_dev_usage usage;
681
682
bch2_dev_usage_read_fast(ca, &usage);
683
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
684
}
685
686
static int add_new_bucket(struct bch_fs *c,
687
struct alloc_request *req,
688
struct open_bucket *ob)
689
{
690
unsigned durability = ob_dev(c, ob)->mi.durability;
691
692
BUG_ON(req->nr_effective >= req->nr_replicas);
693
694
__clear_bit(ob->dev, req->devs_may_alloc.d);
695
req->nr_effective += durability;
696
req->have_cache |= !durability;
697
698
ob_push(c, &req->ptrs, ob);
699
700
if (req->nr_effective >= req->nr_replicas)
701
return 1;
702
if (ob->ec)
703
return 1;
704
return 0;
705
}
706
707
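/*
 * Allocate buckets from multiple devices: walk the candidate devices in the
 * order produced by bch2_dev_alloc_list() and allocate one bucket at a time
 * until req->nr_replicas is satisfied. Returns 0 on success, an error from
 * the underlying allocation (e.g. a transaction restart), or
 * insufficient_devices if we ran out of devices to try.
 */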
inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
708
struct alloc_request *req,
709
struct dev_stripe_state *stripe,
710
struct closure *cl)
711
{
712
struct bch_fs *c = trans->c;
713
int ret = 0;
714
715
BUG_ON(req->nr_effective >= req->nr_replicas);
716
717
bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
718
719
darray_for_each(req->devs_sorted, i) {
720
req->ca = bch2_dev_tryget_noerror(c, *i);
721
if (!req->ca)
722
continue;
723
724
if (!req->ca->mi.durability && req->have_cache) {
725
bch2_dev_put(req->ca);
726
continue;
727
}
728
729
struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
730
req->flags & BCH_WRITE_alloc_nowait);
731
if (!IS_ERR(ob))
732
bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
733
bch2_dev_put(req->ca);
734
735
if (IS_ERR(ob)) {
736
ret = PTR_ERR(ob);
737
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
738
break;
739
continue;
740
}
741
742
ret = add_new_bucket(c, req, ob);
743
if (ret)
744
break;
745
}
746
747
if (ret == 1)
748
return 0;
749
if (ret)
750
return ret;
751
return bch_err_throw(c, insufficient_devices);
752
}
753
754
/* Allocate from stripes: */
755
756
/*
757
* if we can't allocate a new stripe because there are already too many
758
* partially filled stripes, force allocating from an existing stripe even when
759
* it's to a device we don't want:
760
*/
761
762
static int bucket_alloc_from_stripe(struct btree_trans *trans,
763
struct alloc_request *req,
764
struct closure *cl)
765
{
766
struct bch_fs *c = trans->c;
767
int ret = 0;
768
769
if (req->nr_replicas < 2)
770
return 0;
771
772
if (ec_open_bucket(c, &req->ptrs))
773
return 0;
774
775
struct ec_stripe_head *h =
776
bch2_ec_stripe_head_get(trans, req, 0, cl);
777
if (IS_ERR(h))
778
return PTR_ERR(h);
779
if (!h)
780
return 0;
781
782
bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
783
784
darray_for_each(req->devs_sorted, i)
785
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
786
if (!h->s->blocks[ec_idx])
787
continue;
788
789
struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
790
if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
791
ob->ec_idx = ec_idx;
792
ob->ec = h->s;
793
ec_stripe_new_get(h->s, STRIPE_REF_io);
794
795
ret = add_new_bucket(c, req, ob);
796
goto out;
797
}
798
}
799
out:
800
bch2_ec_stripe_head_put(c, h);
801
return ret;
802
}
803
804
/* Sector allocator */
805
806
static bool want_bucket(struct bch_fs *c,
807
struct alloc_request *req,
808
struct open_bucket *ob)
809
{
810
struct bch_dev *ca = ob_dev(c, ob);
811
812
if (!test_bit(ob->dev, req->devs_may_alloc.d))
813
return false;
814
815
if (ob->data_type != req->wp->data_type)
816
return false;
817
818
if (!ca->mi.durability &&
819
(req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
820
return false;
821
822
if (req->ec != (ob->ec != NULL))
823
return false;
824
825
return true;
826
}
827
828
static int bucket_alloc_set_writepoint(struct bch_fs *c,
829
struct alloc_request *req)
830
{
831
struct open_bucket *ob;
832
unsigned i;
833
int ret = 0;
834
835
req->scratch_ptrs.nr = 0;
836
837
open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
838
if (!ret && want_bucket(c, req, ob))
839
ret = add_new_bucket(c, req, ob);
840
else
841
ob_push(c, &req->scratch_ptrs, ob);
842
}
843
req->wp->ptrs = req->scratch_ptrs;
844
845
return ret;
846
}
847
848
static int bucket_alloc_set_partial(struct bch_fs *c,
849
struct alloc_request *req)
850
{
851
int i, ret = 0;
852
853
if (!c->open_buckets_partial_nr)
854
return 0;
855
856
spin_lock(&c->freelist_lock);
857
858
if (!c->open_buckets_partial_nr)
859
goto unlock;
860
861
for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
862
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
863
864
if (want_bucket(c, req, ob)) {
865
struct bch_dev *ca = ob_dev(c, ob);
866
u64 avail;
867
868
bch2_dev_usage_read_fast(ca, &req->usage);
869
avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
870
if (!avail)
871
continue;
872
873
array_remove_item(c->open_buckets_partial,
874
c->open_buckets_partial_nr,
875
i);
876
ob->on_partial_list = false;
877
878
scoped_guard(rcu)
879
bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
880
881
ret = add_new_bucket(c, req, ob);
882
if (ret)
883
break;
884
}
885
}
886
unlock:
887
spin_unlock(&c->freelist_lock);
888
return ret;
889
}
890
891
static int __open_bucket_add_buckets(struct btree_trans *trans,
892
struct alloc_request *req,
893
struct closure *_cl)
894
{
895
struct bch_fs *c = trans->c;
896
struct open_bucket *ob;
897
struct closure *cl = NULL;
898
unsigned i;
899
int ret;
900
901
req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
902
903
/* Don't allocate from devices we already have pointers to: */
904
darray_for_each(*req->devs_have, i)
905
__clear_bit(*i, req->devs_may_alloc.d);
906
907
open_bucket_for_each(c, &req->ptrs, ob, i)
908
__clear_bit(ob->dev, req->devs_may_alloc.d);
909
910
ret = bucket_alloc_set_writepoint(c, req);
911
if (ret)
912
return ret;
913
914
ret = bucket_alloc_set_partial(c, req);
915
if (ret)
916
return ret;
917
918
if (req->ec) {
919
ret = bucket_alloc_from_stripe(trans, req, _cl);
920
} else {
921
retry_blocking:
922
/*
923
* Try nonblocking first, so that if one device is full we'll try from
924
* other devices:
925
*/
926
ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
927
if (ret &&
928
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
929
!bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
930
!cl && _cl) {
931
cl = _cl;
932
goto retry_blocking;
933
}
934
}
935
936
return ret;
937
}
938
939
static int open_bucket_add_buckets(struct btree_trans *trans,
940
struct alloc_request *req,
941
struct closure *cl)
942
{
943
int ret;
944
945
if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
946
ret = __open_bucket_add_buckets(trans, req, cl);
947
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
948
bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
949
bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
950
bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
951
return ret;
952
if (req->nr_effective >= req->nr_replicas)
953
return 0;
954
}
955
956
bool ec = false;
957
swap(ec, req->ec);
958
ret = __open_bucket_add_buckets(trans, req, cl);
959
swap(ec, req->ec);
960
961
return ret < 0 ? ret : 0;
962
}
963
964
/**
965
 * should_drop_bucket - check if this open_bucket should go away
966
* @ob: open_bucket to predicate on
967
* @c: filesystem handle
968
* @ca: if set, we're killing buckets for a particular device
969
* @ec: if true, we're shutting down erasure coding and killing all ec
970
* open_buckets
971
 * (if neither @ca nor @ec is given, all open_buckets are dropped)
972
* Returns: true if we should kill this open_bucket
973
*
974
* We're killing open_buckets because we're shutting down a device, erasure
975
* coding, or the entire filesystem - check if this open_bucket matches:
976
*/
977
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
978
struct bch_dev *ca, bool ec)
979
{
980
if (ec) {
981
return ob->ec != NULL;
982
} else if (ca) {
983
bool drop = ob->dev == ca->dev_idx;
984
struct open_bucket *ob2;
985
unsigned i;
986
987
if (!drop && ob->ec) {
988
unsigned nr_blocks;
989
990
mutex_lock(&ob->ec->lock);
991
nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
992
993
for (i = 0; i < nr_blocks; i++) {
994
if (!ob->ec->blocks[i])
995
continue;
996
997
ob2 = c->open_buckets + ob->ec->blocks[i];
998
drop |= ob2->dev == ca->dev_idx;
999
}
1000
mutex_unlock(&ob->ec->lock);
1001
}
1002
1003
return drop;
1004
} else {
1005
return true;
1006
}
1007
}
1008
1009
static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1010
bool ec, struct write_point *wp)
1011
{
1012
struct open_buckets ptrs = { .nr = 0 };
1013
struct open_bucket *ob;
1014
unsigned i;
1015
1016
mutex_lock(&wp->lock);
1017
open_bucket_for_each(c, &wp->ptrs, ob, i)
1018
if (should_drop_bucket(ob, c, ca, ec))
1019
bch2_open_bucket_put(c, ob);
1020
else
1021
ob_push(c, &ptrs, ob);
1022
wp->ptrs = ptrs;
1023
mutex_unlock(&wp->lock);
1024
}
1025
1026
void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1027
bool ec)
1028
{
1029
unsigned i;
1030
1031
/* Next, close write points that point to this device... */
1032
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1033
bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1034
1035
bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1036
bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1037
bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1038
1039
mutex_lock(&c->btree_reserve_cache_lock);
1040
while (c->btree_reserve_cache_nr) {
1041
struct btree_alloc *a =
1042
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1043
1044
bch2_open_buckets_put(c, &a->ob);
1045
}
1046
mutex_unlock(&c->btree_reserve_cache_lock);
1047
1048
spin_lock(&c->freelist_lock);
1049
i = 0;
1050
while (i < c->open_buckets_partial_nr) {
1051
struct open_bucket *ob =
1052
c->open_buckets + c->open_buckets_partial[i];
1053
1054
if (should_drop_bucket(ob, c, ca, ec)) {
1055
--c->open_buckets_partial_nr;
1056
swap(c->open_buckets_partial[i],
1057
c->open_buckets_partial[c->open_buckets_partial_nr]);
1058
1059
ob->on_partial_list = false;
1060
1061
scoped_guard(rcu)
1062
bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
1063
1064
spin_unlock(&c->freelist_lock);
1065
bch2_open_bucket_put(c, ob);
1066
spin_lock(&c->freelist_lock);
1067
} else {
1068
i++;
1069
}
1070
}
1071
spin_unlock(&c->freelist_lock);
1072
1073
bch2_ec_stop_dev(c, ca);
1074
}
1075
1076
static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1077
unsigned long write_point)
1078
{
1079
unsigned hash =
1080
hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1081
1082
return &c->write_points_hash[hash];
1083
}
1084
1085
static struct write_point *__writepoint_find(struct hlist_head *head,
1086
unsigned long write_point)
1087
{
1088
struct write_point *wp;
1089
1090
guard(rcu)();
1091
hlist_for_each_entry_rcu(wp, head, node)
1092
if (wp->write_point == write_point)
1093
return wp;
1094
return NULL;
1095
}
1096
1097
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1098
{
1099
u64 stranded = c->write_points_nr * c->bucket_size_max;
1100
u64 free = bch2_fs_usage_read_short(c).free;
1101
1102
return stranded * factor > free;
1103
}
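/*
 * Worked example with illustrative numbers: 32 write points and a maximum
 * bucket size of 1024 means up to 32 * 1024 = 32768 units of space can be
 * stranded in partially filled write point buckets; with factor = 32 that
 * counts as "too many" once free space drops below 32 * 32768 = 1048576.
 */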
1104
1105
static noinline bool try_increase_writepoints(struct bch_fs *c)
1106
{
1107
struct write_point *wp;
1108
1109
if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1110
too_many_writepoints(c, 32))
1111
return false;
1112
1113
wp = c->write_points + c->write_points_nr++;
1114
hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1115
return true;
1116
}
1117
1118
static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1119
{
1120
struct bch_fs *c = trans->c;
1121
struct write_point *wp;
1122
struct open_bucket *ob;
1123
unsigned i;
1124
1125
mutex_lock(&c->write_points_hash_lock);
1126
if (c->write_points_nr < old_nr) {
1127
mutex_unlock(&c->write_points_hash_lock);
1128
return true;
1129
}
1130
1131
if (c->write_points_nr == 1 ||
1132
!too_many_writepoints(c, 8)) {
1133
mutex_unlock(&c->write_points_hash_lock);
1134
return false;
1135
}
1136
1137
wp = c->write_points + --c->write_points_nr;
1138
1139
hlist_del_rcu(&wp->node);
1140
mutex_unlock(&c->write_points_hash_lock);
1141
1142
bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1143
open_bucket_for_each(c, &wp->ptrs, ob, i)
1144
open_bucket_free_unused(c, ob);
1145
wp->ptrs.nr = 0;
1146
mutex_unlock(&wp->lock);
1147
return true;
1148
}
1149
1150
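/*
 * Find (or steal) a write point for @write_point: a value with the low bit
 * clear is treated as a direct pointer to a struct write_point (used for
 * fixed write points like btree/copygc/rebalance); otherwise it's hashed,
 * and if no matching write point exists we either grow the write point
 * array or evict and reuse the least recently used one.
 */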
static struct write_point *writepoint_find(struct btree_trans *trans,
1151
unsigned long write_point)
1152
{
1153
struct bch_fs *c = trans->c;
1154
struct write_point *wp, *oldest;
1155
struct hlist_head *head;
1156
1157
if (!(write_point & 1UL)) {
1158
wp = (struct write_point *) write_point;
1159
bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1160
return wp;
1161
}
1162
1163
head = writepoint_hash(c, write_point);
1164
restart_find:
1165
wp = __writepoint_find(head, write_point);
1166
if (wp) {
1167
lock_wp:
1168
bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1169
if (wp->write_point == write_point)
1170
goto out;
1171
mutex_unlock(&wp->lock);
1172
goto restart_find;
1173
}
1174
restart_find_oldest:
1175
oldest = NULL;
1176
for (wp = c->write_points;
1177
wp < c->write_points + c->write_points_nr; wp++)
1178
if (!oldest || time_before64(wp->last_used, oldest->last_used))
1179
oldest = wp;
1180
1181
bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1182
bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1183
if (oldest >= c->write_points + c->write_points_nr ||
1184
try_increase_writepoints(c)) {
1185
mutex_unlock(&c->write_points_hash_lock);
1186
mutex_unlock(&oldest->lock);
1187
goto restart_find_oldest;
1188
}
1189
1190
wp = __writepoint_find(head, write_point);
1191
if (wp && wp != oldest) {
1192
mutex_unlock(&c->write_points_hash_lock);
1193
mutex_unlock(&oldest->lock);
1194
goto lock_wp;
1195
}
1196
1197
wp = oldest;
1198
hlist_del_rcu(&wp->node);
1199
wp->write_point = write_point;
1200
hlist_add_head_rcu(&wp->node, head);
1201
mutex_unlock(&c->write_points_hash_lock);
1202
out:
1203
wp->last_used = local_clock();
1204
return wp;
1205
}
1206
1207
static noinline void
1208
deallocate_extra_replicas(struct bch_fs *c,
1209
struct alloc_request *req)
1210
{
1211
struct open_bucket *ob;
1212
unsigned extra_replicas = req->nr_effective - req->nr_replicas;
1213
unsigned i;
1214
1215
req->scratch_ptrs.nr = 0;
1216
1217
open_bucket_for_each(c, &req->ptrs, ob, i) {
1218
unsigned d = ob_dev(c, ob)->mi.durability;
1219
1220
if (d && d <= extra_replicas) {
1221
extra_replicas -= d;
1222
ob_push(c, &req->wp->ptrs, ob);
1223
} else {
1224
ob_push(c, &req->scratch_ptrs, ob);
1225
}
1226
}
1227
1228
req->ptrs = req->scratch_ptrs;
1229
}
1230
1231
/*
1232
* Get us an open_bucket we can allocate from, return with it locked:
1233
*/
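/*
 * Typical usage, roughly: bch2_alloc_sectors_start_trans() reserves space and
 * returns with the write point locked; bch2_alloc_sectors_append_ptrs() adds
 * pointers to the allocated space to the key being written;
 * bch2_alloc_sectors_done() releases the write point. The open_bucket
 * references taken here are dropped only after the index update - see the
 * open_buckets comment near the top of this file.
 */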
1234
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1235
unsigned target,
1236
unsigned erasure_code,
1237
struct write_point_specifier write_point,
1238
struct bch_devs_list *devs_have,
1239
unsigned nr_replicas,
1240
unsigned nr_replicas_required,
1241
enum bch_watermark watermark,
1242
enum bch_write_flags flags,
1243
struct closure *cl,
1244
struct write_point **wp_ret)
1245
{
1246
struct bch_fs *c = trans->c;
1247
struct open_bucket *ob;
1248
unsigned write_points_nr;
1249
int i;
1250
1251
struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
1252
int ret = PTR_ERR_OR_ZERO(req);
1253
if (unlikely(ret))
1254
return ret;
1255
1256
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1257
erasure_code = false;
1258
1259
req->nr_replicas = nr_replicas;
1260
req->target = target;
1261
req->ec = erasure_code;
1262
req->watermark = watermark;
1263
req->flags = flags;
1264
req->devs_have = devs_have;
1265
1266
BUG_ON(!nr_replicas || !nr_replicas_required);
1267
retry:
1268
req->ptrs.nr = 0;
1269
req->nr_effective = 0;
1270
req->have_cache = false;
1271
write_points_nr = c->write_points_nr;
1272
1273
*wp_ret = req->wp = writepoint_find(trans, write_point.v);
1274
1275
req->data_type = req->wp->data_type;
1276
1277
ret = bch2_trans_relock(trans);
1278
if (ret)
1279
goto err;
1280
1281
/* metadata may not allocate on cache devices: */
1282
if (req->data_type != BCH_DATA_user)
1283
req->have_cache = true;
1284
1285
if (target && !(flags & BCH_WRITE_only_specified_devs)) {
1286
ret = open_bucket_add_buckets(trans, req, NULL);
1287
if (!ret ||
1288
bch2_err_matches(ret, BCH_ERR_transaction_restart))
1289
goto alloc_done;
1290
1291
/* Don't retry from all devices if we're out of open buckets: */
1292
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1293
int ret2 = open_bucket_add_buckets(trans, req, cl);
1294
if (!ret2 ||
1295
bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
1296
bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
1297
ret = ret2;
1298
goto alloc_done;
1299
}
1300
}
1301
1302
/*
1303
* Only try to allocate cache (durability = 0 devices) from the
1304
* specified target:
1305
*/
1306
req->have_cache = true;
1307
req->target = 0;
1308
1309
ret = open_bucket_add_buckets(trans, req, cl);
1310
} else {
1311
ret = open_bucket_add_buckets(trans, req, cl);
1312
}
1313
alloc_done:
1314
BUG_ON(!ret && req->nr_effective < req->nr_replicas);
1315
1316
if (erasure_code && !ec_open_bucket(c, &req->ptrs))
1317
pr_debug("failed to get ec bucket: ret %u", ret);
1318
1319
if (ret == -BCH_ERR_insufficient_devices &&
1320
req->nr_effective >= nr_replicas_required)
1321
ret = 0;
1322
1323
if (ret)
1324
goto err;
1325
1326
if (req->nr_effective > req->nr_replicas)
1327
deallocate_extra_replicas(c, req);
1328
1329
/* Free buckets we didn't use: */
1330
open_bucket_for_each(c, &req->wp->ptrs, ob, i)
1331
open_bucket_free_unused(c, ob);
1332
1333
req->wp->ptrs = req->ptrs;
1334
1335
req->wp->sectors_free = UINT_MAX;
1336
1337
open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
1338
/*
1339
* Ensure proper write alignment - either due to misaligned
1340
* bucket sizes (from buggy bcachefs-tools), or writes that mix
1341
* logical/physical alignment:
1342
*/
1343
struct bch_dev *ca = ob_dev(c, ob);
1344
u64 offset = bucket_to_sector(ca, ob->bucket) +
1345
ca->mi.bucket_size -
1346
ob->sectors_free;
1347
unsigned align = round_up(offset, block_sectors(c)) - offset;
1348
1349
ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
1350
1351
req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
1352
}
1353
1354
req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
1355
1356
/* Did alignment use up space in an open_bucket? */
1357
if (unlikely(!req->wp->sectors_free)) {
1358
bch2_alloc_sectors_done(c, req->wp);
1359
goto retry;
1360
}
1361
1362
BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
1363
1364
return 0;
1365
err:
1366
open_bucket_for_each(c, &req->wp->ptrs, ob, i)
1367
if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
1368
ob_push(c, &req->ptrs, ob);
1369
else
1370
open_bucket_free_unused(c, ob);
1371
req->wp->ptrs = req->ptrs;
1372
1373
mutex_unlock(&req->wp->lock);
1374
1375
if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1376
try_decrease_writepoints(trans, write_points_nr))
1377
goto retry;
1378
1379
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1380
ret = bch_err_throw(c, bucket_alloc_blocked);
1381
1382
if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
1383
bch2_err_matches(ret, BCH_ERR_freelist_empty))
1384
ret = bch_err_throw(c, bucket_alloc_blocked);
1385
1386
return ret;
1387
}
1388
1389
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1390
struct bkey_i *k, unsigned sectors,
1391
bool cached)
1392
{
1393
bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1394
}
1395
1396
/*
1397
* Append pointers to the space we just allocated to @k, and mark @sectors space
1398
* as allocated out of @ob
1399
*/
1400
void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1401
{
1402
bch2_alloc_sectors_done_inlined(c, wp);
1403
}
1404
1405
static inline void writepoint_init(struct write_point *wp,
1406
enum bch_data_type type)
1407
{
1408
mutex_init(&wp->lock);
1409
wp->data_type = type;
1410
1411
INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1412
INIT_LIST_HEAD(&wp->writes);
1413
spin_lock_init(&wp->writes_lock);
1414
}
1415
1416
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1417
{
1418
struct open_bucket *ob;
1419
struct write_point *wp;
1420
1421
mutex_init(&c->write_points_hash_lock);
1422
c->write_points_nr = ARRAY_SIZE(c->write_points);
1423
1424
	/* open bucket 0 is a sentinel NULL: */
1425
spin_lock_init(&c->open_buckets[0].lock);
1426
1427
for (ob = c->open_buckets + 1;
1428
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1429
spin_lock_init(&ob->lock);
1430
c->open_buckets_nr_free++;
1431
1432
ob->freelist = c->open_buckets_freelist;
1433
c->open_buckets_freelist = ob - c->open_buckets;
1434
}
1435
1436
writepoint_init(&c->btree_write_point, BCH_DATA_btree);
1437
writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
1438
writepoint_init(&c->copygc_write_point, BCH_DATA_user);
1439
1440
for (wp = c->write_points;
1441
wp < c->write_points + c->write_points_nr; wp++) {
1442
writepoint_init(wp, BCH_DATA_user);
1443
1444
wp->last_used = local_clock();
1445
wp->write_point = (unsigned long) wp;
1446
hlist_add_head_rcu(&wp->node,
1447
writepoint_hash(c, wp->write_point));
1448
}
1449
}
1450
1451
void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1452
{
1453
struct bch_dev *ca = ob_dev(c, ob);
1454
unsigned data_type = ob->data_type;
1455
barrier(); /* READ_ONCE() doesn't work on bitfields */
1456
1457
prt_printf(out, "%zu ref %u ",
1458
ob - c->open_buckets,
1459
atomic_read(&ob->pin));
1460
bch2_prt_data_type(out, data_type);
1461
prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1462
ob->dev, ob->bucket, ob->gen,
1463
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1464
if (ob->ec)
1465
prt_printf(out, " ec idx %llu", ob->ec->idx);
1466
if (ob->on_partial_list)
1467
prt_str(out, " partial");
1468
prt_newline(out);
1469
}
1470
1471
void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
1472
struct bch_dev *ca)
1473
{
1474
struct open_bucket *ob;
1475
1476
out->atomic++;
1477
1478
for (ob = c->open_buckets;
1479
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1480
ob++) {
1481
spin_lock(&ob->lock);
1482
if (ob->valid && (!ca || ob->dev == ca->dev_idx))
1483
bch2_open_bucket_to_text(out, c, ob);
1484
spin_unlock(&ob->lock);
1485
}
1486
1487
--out->atomic;
1488
}
1489
1490
void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1491
{
1492
unsigned i;
1493
1494
out->atomic++;
1495
spin_lock(&c->freelist_lock);
1496
1497
for (i = 0; i < c->open_buckets_partial_nr; i++)
1498
bch2_open_bucket_to_text(out, c,
1499
c->open_buckets + c->open_buckets_partial[i]);
1500
1501
spin_unlock(&c->freelist_lock);
1502
--out->atomic;
1503
}
1504
1505
static const char * const bch2_write_point_states[] = {
1506
#define x(n) #n,
1507
WRITE_POINT_STATES()
1508
#undef x
1509
NULL
1510
};
1511
1512
static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1513
struct write_point *wp)
1514
{
1515
struct open_bucket *ob;
1516
unsigned i;
1517
1518
mutex_lock(&wp->lock);
1519
1520
prt_printf(out, "%lu: ", wp->write_point);
1521
prt_human_readable_u64(out, wp->sectors_allocated << 9);
1522
1523
prt_printf(out, " last wrote: ");
1524
bch2_pr_time_units(out, sched_clock() - wp->last_used);
1525
1526
for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1527
prt_printf(out, " %s: ", bch2_write_point_states[i]);
1528
bch2_pr_time_units(out, wp->time[i]);
1529
}
1530
1531
prt_newline(out);
1532
1533
printbuf_indent_add(out, 2);
1534
open_bucket_for_each(c, &wp->ptrs, ob, i)
1535
bch2_open_bucket_to_text(out, c, ob);
1536
printbuf_indent_sub(out, 2);
1537
1538
mutex_unlock(&wp->lock);
1539
}
1540
1541
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1542
{
1543
struct write_point *wp;
1544
1545
prt_str(out, "Foreground write points\n");
1546
for (wp = c->write_points;
1547
wp < c->write_points + ARRAY_SIZE(c->write_points);
1548
wp++)
1549
bch2_write_point_to_text(out, c, wp);
1550
1551
prt_str(out, "Copygc write point\n");
1552
bch2_write_point_to_text(out, c, &c->copygc_write_point);
1553
1554
prt_str(out, "Rebalance write point\n");
1555
bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1556
1557
prt_str(out, "Btree write point\n");
1558
bch2_write_point_to_text(out, c, &c->btree_write_point);
1559
}
1560
1561
void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
1562
{
1563
unsigned nr[BCH_DATA_NR];
1564
1565
memset(nr, 0, sizeof(nr));
1566
1567
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
1568
nr[c->open_buckets[i].data_type]++;
1569
1570
printbuf_tabstops_reset(out);
1571
printbuf_tabstop_push(out, 24);
1572
1573
prt_printf(out, "capacity\t%llu\n", c->capacity);
1574
prt_printf(out, "reserved\t%llu\n", c->reserved);
1575
prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
1576
prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
1577
prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
1578
prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
1579
prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
1580
prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
1581
prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
1582
1583
prt_newline(out);
1584
prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
1585
prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
1586
prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
1587
prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
1588
prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
1589
prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
1590
prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
1591
}
1592
1593
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
1594
{
1595
struct bch_fs *c = ca->fs;
1596
struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
1597
unsigned nr[BCH_DATA_NR];
1598
1599
memset(nr, 0, sizeof(nr));
1600
1601
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
1602
nr[c->open_buckets[i].data_type]++;
1603
1604
bch2_dev_usage_to_text(out, ca, &stats);
1605
1606
prt_newline(out);
1607
1608
prt_printf(out, "reserves:\n");
1609
for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
1610
prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
1611
1612
prt_newline(out);
1613
1614
printbuf_tabstops_reset(out);
1615
printbuf_tabstop_push(out, 12);
1616
printbuf_tabstop_push(out, 16);
1617
1618
prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
1619
prt_printf(out, "buckets to invalidate\t%llu\r\n",
1620
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
1621
}
1622
1623
static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
1624
{
1625
struct printbuf buf = PRINTBUF;
1626
1627
prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
1628
c->opts.allocator_stuck_timeout);
1629
1630
prt_printf(&buf, "Allocator debug:\n");
1631
printbuf_indent_add(&buf, 2);
1632
bch2_fs_alloc_debug_to_text(&buf, c);
1633
printbuf_indent_sub(&buf, 2);
1634
prt_newline(&buf);
1635
1636
bch2_printbuf_make_room(&buf, 4096);
1637
1638
buf.atomic++;
1639
scoped_guard(rcu)
1640
for_each_online_member_rcu(c, ca) {
1641
prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
1642
printbuf_indent_add(&buf, 2);
1643
bch2_dev_alloc_debug_to_text(&buf, ca);
1644
printbuf_indent_sub(&buf, 2);
1645
prt_newline(&buf);
1646
}
1647
--buf.atomic;
1648
1649
prt_printf(&buf, "Copygc debug:\n");
1650
printbuf_indent_add(&buf, 2);
1651
bch2_copygc_wait_to_text(&buf, c);
1652
printbuf_indent_sub(&buf, 2);
1653
prt_newline(&buf);
1654
1655
prt_printf(&buf, "Journal debug:\n");
1656
printbuf_indent_add(&buf, 2);
1657
bch2_journal_debug_to_text(&buf, &c->journal);
1658
printbuf_indent_sub(&buf, 2);
1659
1660
bch2_print_str(c, KERN_ERR, buf.buf);
1661
printbuf_exit(&buf);
1662
}
1663
1664
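/*
 * Rate limit "allocator stuck" reports: returning 0 makes
 * __bch2_wait_on_allocator() skip the timed wait (and hence the debug dump)
 * if we already complained within the last two minutes; otherwise wait up to
 * the configured allocator_stuck_timeout before reporting.
 */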
static inline unsigned allocator_wait_timeout(struct bch_fs *c)
1665
{
1666
if (c->allocator_last_stuck &&
1667
time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
1668
return 0;
1669
1670
return c->opts.allocator_stuck_timeout * HZ;
1671
}
1672
1673
void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
1674
{
1675
unsigned t = allocator_wait_timeout(c);
1676
1677
if (t && closure_sync_timeout(cl, t)) {
1678
c->allocator_last_stuck = jiffies;
1679
bch2_print_allocator_stuck(c);
1680
}
1681
1682
closure_sync(cl);
1683
}
1684
1685