GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/delayed-inode.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2011 Fujitsu. All rights reserved.
4
* Written by Miao Xie <[email protected]>
5
*/
6
7
#include <linux/slab.h>
8
#include <linux/iversion.h>
9
#include "ctree.h"
10
#include "fs.h"
11
#include "messages.h"
12
#include "misc.h"
13
#include "delayed-inode.h"
14
#include "disk-io.h"
15
#include "transaction.h"
16
#include "qgroup.h"
17
#include "locking.h"
18
#include "inode-item.h"
19
#include "space-info.h"
20
#include "accessors.h"
21
#include "file-item.h"
22
23
#define BTRFS_DELAYED_WRITEBACK 512
24
#define BTRFS_DELAYED_BACKGROUND 128
25
#define BTRFS_DELAYED_BATCH 16
26
27
static struct kmem_cache *delayed_node_cache;
28
29
int __init btrfs_delayed_inode_init(void)
30
{
31
delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);
32
if (!delayed_node_cache)
33
return -ENOMEM;
34
return 0;
35
}
36
37
void __cold btrfs_delayed_inode_exit(void)
38
{
39
kmem_cache_destroy(delayed_node_cache);
40
}
41
42
void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
43
{
44
atomic_set(&delayed_root->items, 0);
45
atomic_set(&delayed_root->items_seq, 0);
46
delayed_root->nodes = 0;
47
spin_lock_init(&delayed_root->lock);
48
init_waitqueue_head(&delayed_root->wait);
49
INIT_LIST_HEAD(&delayed_root->node_list);
50
INIT_LIST_HEAD(&delayed_root->prepare_list);
51
}
52
53
static inline void btrfs_init_delayed_node(
54
struct btrfs_delayed_node *delayed_node,
55
struct btrfs_root *root, u64 inode_id)
56
{
57
delayed_node->root = root;
58
delayed_node->inode_id = inode_id;
59
refcount_set(&delayed_node->refs, 0);
60
delayed_node->ins_root = RB_ROOT_CACHED;
61
delayed_node->del_root = RB_ROOT_CACHED;
62
mutex_init(&delayed_node->mutex);
63
INIT_LIST_HEAD(&delayed_node->n_list);
64
INIT_LIST_HEAD(&delayed_node->p_list);
65
}
66
67
static struct btrfs_delayed_node *btrfs_get_delayed_node(
68
struct btrfs_inode *btrfs_inode)
69
{
70
struct btrfs_root *root = btrfs_inode->root;
71
u64 ino = btrfs_ino(btrfs_inode);
72
struct btrfs_delayed_node *node;
73
74
node = READ_ONCE(btrfs_inode->delayed_node);
75
if (node) {
76
refcount_inc(&node->refs);
77
return node;
78
}
79
80
xa_lock(&root->delayed_nodes);
81
node = xa_load(&root->delayed_nodes, ino);
82
83
if (node) {
84
if (btrfs_inode->delayed_node) {
85
refcount_inc(&node->refs); /* can be accessed */
86
BUG_ON(btrfs_inode->delayed_node != node);
87
xa_unlock(&root->delayed_nodes);
88
return node;
89
}
90
91
/*
92
* It's possible that we're racing into the middle of removing
93
* this node from the xarray. In this case, the refcount
94
* was zero and it should never go back to one. Just return
95
* NULL like it was never in the xarray at all; our release
96
* function is in the process of removing it.
97
*
98
* Some implementations of refcount_inc refuse to bump the
99
* refcount once it has hit zero. If we don't do this dance
100
* here, refcount_inc() may decide to just WARN_ONCE() instead
101
* of actually bumping the refcount.
102
*
103
* If this node is properly in the xarray, we want to bump the
104
* refcount twice, once for the inode and once for this get
105
* operation.
106
*/
107
if (refcount_inc_not_zero(&node->refs)) {
108
refcount_inc(&node->refs);
109
btrfs_inode->delayed_node = node;
110
} else {
111
node = NULL;
112
}
113
114
xa_unlock(&root->delayed_nodes);
115
return node;
116
}
117
xa_unlock(&root->delayed_nodes);
118
119
return NULL;
120
}
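/*
 * Reference counting overview (informal summary of the code above): a live
 * delayed node holds one reference for the pointer cached in
 * btrfs_inode::delayed_node, one per in-flight user, and one while it sits on
 * the delayed root's node_list. A typical read-only user looks roughly like
 * the following sketch (illustrative only):
 *
 *	struct btrfs_delayed_node *dn = btrfs_get_delayed_node(inode);
 *
 *	if (dn) {
 *		mutex_lock(&dn->mutex);
 *		... inspect dn->count, dn->flags, ...
 *		mutex_unlock(&dn->mutex);
 *		btrfs_release_delayed_node(dn);
 *	}
 *
 * The refcount_inc_not_zero() above is what protects such lookups against a
 * node whose last reference is being dropped concurrently.
 */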
121
122
/*
123
* Look up an existing delayed node associated with @btrfs_inode or create a new
124
* one and insert it to the delayed nodes of the root.
125
*
126
* Return the delayed node, or error pointer on failure.
127
*/
128
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
129
struct btrfs_inode *btrfs_inode)
130
{
131
struct btrfs_delayed_node *node;
132
struct btrfs_root *root = btrfs_inode->root;
133
u64 ino = btrfs_ino(btrfs_inode);
134
int ret;
135
void *ptr;
136
137
again:
138
node = btrfs_get_delayed_node(btrfs_inode);
139
if (node)
140
return node;
141
142
node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
143
if (!node)
144
return ERR_PTR(-ENOMEM);
145
btrfs_init_delayed_node(node, root, ino);
146
147
/* Cached in the inode and can be accessed. */
148
refcount_set(&node->refs, 2);
149
150
	/* Allocate and reserve the slot; a reserved slot still reads back as NULL from xa_load(). */
151
ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
152
if (ret == -ENOMEM) {
153
kmem_cache_free(delayed_node_cache, node);
154
return ERR_PTR(-ENOMEM);
155
}
156
xa_lock(&root->delayed_nodes);
157
ptr = xa_load(&root->delayed_nodes, ino);
158
if (ptr) {
159
/* Somebody inserted it, go back and read it. */
160
xa_unlock(&root->delayed_nodes);
161
kmem_cache_free(delayed_node_cache, node);
162
node = NULL;
163
goto again;
164
}
165
ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
166
ASSERT(xa_err(ptr) != -EINVAL);
167
ASSERT(xa_err(ptr) != -ENOMEM);
168
ASSERT(ptr == NULL);
169
btrfs_inode->delayed_node = node;
170
xa_unlock(&root->delayed_nodes);
171
172
return node;
173
}
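/*
 * Note on the insertion pattern above: xa_reserve() is done with GFP_NOFS
 * before taking the xarray lock, so that the later __xa_store() under
 * xa_lock() can use GFP_ATOMIC without ever needing to allocate (hence the
 * ASSERTs that it cannot fail with -ENOMEM or -EINVAL). If another task
 * populated the slot between the reserve and the store, we free our node and
 * jump back to "again" to pick up the winner via btrfs_get_delayed_node().
 */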
174
175
/*
176
* Call it when holding delayed_node->mutex
177
*
178
* If mod = 1, add this node into the prepared list.
179
*/
180
static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
181
struct btrfs_delayed_node *node,
182
int mod)
183
{
184
spin_lock(&root->lock);
185
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
186
if (!list_empty(&node->p_list))
187
list_move_tail(&node->p_list, &root->prepare_list);
188
else if (mod)
189
list_add_tail(&node->p_list, &root->prepare_list);
190
} else {
191
list_add_tail(&node->n_list, &root->node_list);
192
list_add_tail(&node->p_list, &root->prepare_list);
193
refcount_inc(&node->refs); /* inserted into list */
194
root->nodes++;
195
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
196
}
197
spin_unlock(&root->lock);
198
}
199
200
/* Call it when holding delayed_node->mutex */
201
static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
202
struct btrfs_delayed_node *node)
203
{
204
spin_lock(&root->lock);
205
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
206
root->nodes--;
207
refcount_dec(&node->refs); /* not in the list */
208
list_del_init(&node->n_list);
209
if (!list_empty(&node->p_list))
210
list_del_init(&node->p_list);
211
clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
212
}
213
spin_unlock(&root->lock);
214
}
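/*
 * The two lists managed above serve different consumers: node_list is the set
 * of all delayed nodes with pending work and is walked at transaction commit
 * time (see __btrfs_run_delayed_items() below), while prepare_list holds
 * nodes handed to the async worker (btrfs_first_prepared_delayed_node() and
 * btrfs_async_run_delayed_root()).
 */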
215
216
static struct btrfs_delayed_node *btrfs_first_delayed_node(
217
struct btrfs_delayed_root *delayed_root)
218
{
219
struct btrfs_delayed_node *node;
220
221
spin_lock(&delayed_root->lock);
222
node = list_first_entry_or_null(&delayed_root->node_list,
223
struct btrfs_delayed_node, n_list);
224
if (node)
225
refcount_inc(&node->refs);
226
spin_unlock(&delayed_root->lock);
227
228
return node;
229
}
230
231
static struct btrfs_delayed_node *btrfs_next_delayed_node(
232
struct btrfs_delayed_node *node)
233
{
234
struct btrfs_delayed_root *delayed_root;
235
struct list_head *p;
236
struct btrfs_delayed_node *next = NULL;
237
238
delayed_root = node->root->fs_info->delayed_root;
239
spin_lock(&delayed_root->lock);
240
if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
241
/* not in the list */
242
if (list_empty(&delayed_root->node_list))
243
goto out;
244
p = delayed_root->node_list.next;
245
} else if (list_is_last(&node->n_list, &delayed_root->node_list))
246
goto out;
247
else
248
p = node->n_list.next;
249
250
next = list_entry(p, struct btrfs_delayed_node, n_list);
251
refcount_inc(&next->refs);
252
out:
253
spin_unlock(&delayed_root->lock);
254
255
return next;
256
}
257
258
static void __btrfs_release_delayed_node(
259
struct btrfs_delayed_node *delayed_node,
260
int mod)
261
{
262
struct btrfs_delayed_root *delayed_root;
263
264
if (!delayed_node)
265
return;
266
267
delayed_root = delayed_node->root->fs_info->delayed_root;
268
269
mutex_lock(&delayed_node->mutex);
270
if (delayed_node->count)
271
btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
272
else
273
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
274
mutex_unlock(&delayed_node->mutex);
275
276
if (refcount_dec_and_test(&delayed_node->refs)) {
277
struct btrfs_root *root = delayed_node->root;
278
279
xa_erase(&root->delayed_nodes, delayed_node->inode_id);
280
/*
281
* Once our refcount goes to zero, nobody is allowed to bump it
282
* back up. We can delete it now.
283
*/
284
ASSERT(refcount_read(&delayed_node->refs) == 0);
285
kmem_cache_free(delayed_node_cache, delayed_node);
286
}
287
}
288
289
static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
290
{
291
__btrfs_release_delayed_node(node, 0);
292
}
293
294
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
295
struct btrfs_delayed_root *delayed_root)
296
{
297
struct btrfs_delayed_node *node;
298
299
spin_lock(&delayed_root->lock);
300
node = list_first_entry_or_null(&delayed_root->prepare_list,
301
struct btrfs_delayed_node, p_list);
302
if (node) {
303
list_del_init(&node->p_list);
304
refcount_inc(&node->refs);
305
}
306
spin_unlock(&delayed_root->lock);
307
308
return node;
309
}
310
311
static inline void btrfs_release_prepared_delayed_node(
312
struct btrfs_delayed_node *node)
313
{
314
__btrfs_release_delayed_node(node, 1);
315
}
316
317
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
318
struct btrfs_delayed_node *node,
319
enum btrfs_delayed_item_type type)
320
{
321
struct btrfs_delayed_item *item;
322
323
item = kmalloc(struct_size(item, data, data_len), GFP_NOFS);
324
if (item) {
325
item->data_len = data_len;
326
item->type = type;
327
item->bytes_reserved = 0;
328
item->delayed_node = node;
329
RB_CLEAR_NODE(&item->rb_node);
330
INIT_LIST_HEAD(&item->log_list);
331
item->logged = false;
332
refcount_set(&item->refs, 1);
333
}
334
return item;
335
}
336
337
static int delayed_item_index_cmp(const void *key, const struct rb_node *node)
338
{
339
const u64 *index = key;
340
const struct btrfs_delayed_item *delayed_item = rb_entry(node,
341
struct btrfs_delayed_item, rb_node);
342
343
if (delayed_item->index < *index)
344
return 1;
345
else if (delayed_item->index > *index)
346
return -1;
347
348
return 0;
349
}
350
351
/*
352
* Look up the delayed item by key.
353
*
354
* @delayed_node: pointer to the delayed node
355
* @index: the dir index value to lookup (offset of a dir index key)
356
*
357
* Note: if we don't find the right item, we will return the prev item and
358
* the next item.
359
*/
360
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
361
struct rb_root *root,
362
u64 index)
363
{
364
struct rb_node *node;
365
366
node = rb_find(&index, root, delayed_item_index_cmp);
367
return rb_entry_safe(node, struct btrfs_delayed_item, rb_node);
368
}
369
370
static int btrfs_delayed_item_cmp(const struct rb_node *new,
371
const struct rb_node *exist)
372
{
373
const struct btrfs_delayed_item *new_item =
374
rb_entry(new, struct btrfs_delayed_item, rb_node);
375
376
return delayed_item_index_cmp(&new_item->index, exist);
377
}
378
379
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
380
struct btrfs_delayed_item *ins)
381
{
382
struct rb_root_cached *root;
383
struct rb_node *exist;
384
385
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
386
root = &delayed_node->ins_root;
387
else
388
root = &delayed_node->del_root;
389
390
exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp);
391
if (exist)
392
return -EEXIST;
393
394
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
395
ins->index >= delayed_node->index_cnt)
396
delayed_node->index_cnt = ins->index + 1;
397
398
delayed_node->count++;
399
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
400
return 0;
401
}
402
403
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
404
{
405
int seq = atomic_inc_return(&delayed_root->items_seq);
406
407
/* atomic_dec_return implies a barrier */
408
if ((atomic_dec_return(&delayed_root->items) <
409
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
410
cond_wake_up_nomb(&delayed_root->wait);
411
}
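/*
 * finish_one_item() is the producer side of the throttling in
 * btrfs_balance_delayed_items(): waiters are woken either when the number of
 * pending items drops below BTRFS_DELAYED_BACKGROUND or after every
 * BTRFS_DELAYED_BATCH completions (tracked via items_seq), which pairs with
 * the could_end_wait() condition further down in this file.
 */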
412
413
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
414
{
415
struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
416
struct rb_root_cached *root;
417
struct btrfs_delayed_root *delayed_root;
418
419
/* Not inserted, ignore it. */
420
if (RB_EMPTY_NODE(&delayed_item->rb_node))
421
return;
422
423
/* If it's in a rbtree, then we need to have delayed node locked. */
424
lockdep_assert_held(&delayed_node->mutex);
425
426
delayed_root = delayed_node->root->fs_info->delayed_root;
427
428
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
429
root = &delayed_node->ins_root;
430
else
431
root = &delayed_node->del_root;
432
433
rb_erase_cached(&delayed_item->rb_node, root);
434
RB_CLEAR_NODE(&delayed_item->rb_node);
435
delayed_node->count--;
436
437
finish_one_item(delayed_root);
438
}
439
440
static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
441
{
442
if (item) {
443
__btrfs_remove_delayed_item(item);
444
if (refcount_dec_and_test(&item->refs))
445
kfree(item);
446
}
447
}
448
449
static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
450
struct btrfs_delayed_node *delayed_node)
451
{
452
struct rb_node *p = rb_first_cached(&delayed_node->ins_root);
453
454
return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
455
}
456
457
static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
458
struct btrfs_delayed_node *delayed_node)
459
{
460
struct rb_node *p = rb_first_cached(&delayed_node->del_root);
461
462
return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
463
}
464
465
static struct btrfs_delayed_item *__btrfs_next_delayed_item(
466
struct btrfs_delayed_item *item)
467
{
468
struct rb_node *p = rb_next(&item->rb_node);
469
470
return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
471
}
472
473
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
474
struct btrfs_delayed_item *item)
475
{
476
struct btrfs_block_rsv *src_rsv;
477
struct btrfs_block_rsv *dst_rsv;
478
struct btrfs_fs_info *fs_info = trans->fs_info;
479
u64 num_bytes;
480
int ret;
481
482
if (!trans->bytes_reserved)
483
return 0;
484
485
src_rsv = trans->block_rsv;
486
dst_rsv = &fs_info->delayed_block_rsv;
487
488
num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
489
490
/*
491
	 * Here we migrate space rsv from transaction rsv, since we have already
492
* reserved space when starting a transaction. So no need to reserve
493
* qgroup space here.
494
*/
495
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
496
if (!ret) {
497
trace_btrfs_space_reservation(fs_info, "delayed_item",
498
item->delayed_node->inode_id,
499
num_bytes, 1);
500
/*
501
* For insertions we track reserved metadata space by accounting
502
* for the number of leaves that will be used, based on the delayed
503
* node's curr_index_batch_size and index_item_leaves fields.
504
*/
505
if (item->type == BTRFS_DELAYED_DELETION_ITEM)
506
item->bytes_reserved = num_bytes;
507
}
508
509
return ret;
510
}
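/*
 * Summary of the reservation scheme: deletion items carry their own
 * reservation in item->bytes_reserved, migrated here from the transaction's
 * block reserve into fs_info->delayed_block_rsv. Insertion items never set
 * bytes_reserved; their space is accounted per leaf through the delayed
 * node's curr_index_batch_size and index_item_leaves fields (see
 * btrfs_insert_delayed_dir_index() below) and released in units of leaves
 * via btrfs_delayed_item_release_leaves().
 */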
511
512
static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
513
struct btrfs_delayed_item *item)
514
{
515
struct btrfs_block_rsv *rsv;
516
struct btrfs_fs_info *fs_info = root->fs_info;
517
518
if (!item->bytes_reserved)
519
return;
520
521
rsv = &fs_info->delayed_block_rsv;
522
/*
523
* Check btrfs_delayed_item_reserve_metadata() to see why we don't need
524
* to release/reserve qgroup space.
525
*/
526
trace_btrfs_space_reservation(fs_info, "delayed_item",
527
item->delayed_node->inode_id,
528
item->bytes_reserved, 0);
529
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
530
}
531
532
static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
533
unsigned int num_leaves)
534
{
535
struct btrfs_fs_info *fs_info = node->root->fs_info;
536
const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
537
538
/* There are no space reservations during log replay, bail out. */
539
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
540
return;
541
542
trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
543
bytes, 0);
544
btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
545
}
546
547
static int btrfs_delayed_inode_reserve_metadata(
548
struct btrfs_trans_handle *trans,
549
struct btrfs_root *root,
550
struct btrfs_delayed_node *node)
551
{
552
struct btrfs_fs_info *fs_info = root->fs_info;
553
struct btrfs_block_rsv *src_rsv;
554
struct btrfs_block_rsv *dst_rsv;
555
u64 num_bytes;
556
int ret;
557
558
src_rsv = trans->block_rsv;
559
dst_rsv = &fs_info->delayed_block_rsv;
560
561
num_bytes = btrfs_calc_metadata_size(fs_info, 1);
562
563
/*
564
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
565
* which doesn't reserve space for speed. This is a problem since we
566
* still need to reserve space for this update, so try to reserve the
567
* space.
568
*
569
* Now if src_rsv == delalloc_block_rsv we'll let it just steal since
570
* we always reserve enough to update the inode item.
571
*/
572
if (!src_rsv || (!trans->bytes_reserved &&
573
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
574
ret = btrfs_qgroup_reserve_meta(root, num_bytes,
575
BTRFS_QGROUP_RSV_META_PREALLOC, true);
576
if (ret < 0)
577
return ret;
578
ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
579
BTRFS_RESERVE_NO_FLUSH);
580
/* NO_FLUSH could only fail with -ENOSPC */
581
ASSERT(ret == 0 || ret == -ENOSPC);
582
if (ret)
583
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
584
} else {
585
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
586
}
587
588
if (!ret) {
589
trace_btrfs_space_reservation(fs_info, "delayed_inode",
590
node->inode_id, num_bytes, 1);
591
node->bytes_reserved = num_bytes;
592
}
593
594
return ret;
595
}
596
597
static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
598
struct btrfs_delayed_node *node,
599
bool qgroup_free)
600
{
601
struct btrfs_block_rsv *rsv;
602
603
if (!node->bytes_reserved)
604
return;
605
606
rsv = &fs_info->delayed_block_rsv;
607
trace_btrfs_space_reservation(fs_info, "delayed_inode",
608
node->inode_id, node->bytes_reserved, 0);
609
btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
610
if (qgroup_free)
611
btrfs_qgroup_free_meta_prealloc(node->root,
612
node->bytes_reserved);
613
else
614
btrfs_qgroup_convert_reserved_meta(node->root,
615
node->bytes_reserved);
616
node->bytes_reserved = 0;
617
}
618
619
/*
620
* Insert a single delayed item or a batch of delayed items, as many as possible
621
* that fit in a leaf. The delayed items (dir index keys) are sorted by their key
622
* in the rbtree, and if there's a gap between two consecutive dir index items,
623
* then it means at some point we had delayed dir indexes to add but they got
624
* removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
625
* into the subvolume tree. Dir index keys also have their offsets coming from a
626
* monotonically increasing counter, so we can't get new keys with an offset that
627
* fits within a gap between delayed dir index items.
628
*/
629
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
630
struct btrfs_root *root,
631
struct btrfs_path *path,
632
struct btrfs_delayed_item *first_item)
633
{
634
struct btrfs_fs_info *fs_info = root->fs_info;
635
struct btrfs_delayed_node *node = first_item->delayed_node;
636
LIST_HEAD(item_list);
637
struct btrfs_delayed_item *curr;
638
struct btrfs_delayed_item *next;
639
const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
640
struct btrfs_item_batch batch;
641
struct btrfs_key first_key;
642
const u32 first_data_size = first_item->data_len;
643
int total_size;
644
char *ins_data = NULL;
645
int ret;
646
bool continuous_keys_only = false;
647
648
lockdep_assert_held(&node->mutex);
649
650
/*
651
* During normal operation the delayed index offset is continuously
652
* increasing, so we can batch insert all items as there will not be any
653
* overlapping keys in the tree.
654
*
655
* The exception to this is log replay, where we may have interleaved
656
* offsets in the tree, so our batch needs to be continuous keys only in
657
* order to ensure we do not end up with out of order items in our leaf.
658
*/
659
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
660
continuous_keys_only = true;
661
662
/*
663
* For delayed items to insert, we track reserved metadata bytes based
664
* on the number of leaves that we will use.
665
* See btrfs_insert_delayed_dir_index() and
666
* btrfs_delayed_item_reserve_metadata()).
667
*/
668
ASSERT(first_item->bytes_reserved == 0);
669
670
list_add_tail(&first_item->tree_list, &item_list);
671
batch.total_data_size = first_data_size;
672
batch.nr = 1;
673
total_size = first_data_size + sizeof(struct btrfs_item);
674
curr = first_item;
675
676
while (true) {
677
int next_size;
678
679
next = __btrfs_next_delayed_item(curr);
680
if (!next)
681
break;
682
683
/*
684
* We cannot allow gaps in the key space if we're doing log
685
* replay.
686
*/
687
if (continuous_keys_only && (next->index != curr->index + 1))
688
break;
689
690
ASSERT(next->bytes_reserved == 0);
691
692
next_size = next->data_len + sizeof(struct btrfs_item);
693
if (total_size + next_size > max_size)
694
break;
695
696
list_add_tail(&next->tree_list, &item_list);
697
batch.nr++;
698
total_size += next_size;
699
batch.total_data_size += next->data_len;
700
curr = next;
701
}
702
703
if (batch.nr == 1) {
704
first_key.objectid = node->inode_id;
705
first_key.type = BTRFS_DIR_INDEX_KEY;
706
first_key.offset = first_item->index;
707
batch.keys = &first_key;
708
batch.data_sizes = &first_data_size;
709
} else {
710
struct btrfs_key *ins_keys;
711
u32 *ins_sizes;
712
int i = 0;
713
714
ins_data = kmalloc(batch.nr * sizeof(u32) +
715
batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
716
if (!ins_data) {
717
ret = -ENOMEM;
718
goto out;
719
}
720
ins_sizes = (u32 *)ins_data;
721
ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
722
batch.keys = ins_keys;
723
batch.data_sizes = ins_sizes;
724
list_for_each_entry(curr, &item_list, tree_list) {
725
ins_keys[i].objectid = node->inode_id;
726
ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
727
ins_keys[i].offset = curr->index;
728
ins_sizes[i] = curr->data_len;
729
i++;
730
}
731
}
732
733
ret = btrfs_insert_empty_items(trans, root, path, &batch);
734
if (ret)
735
goto out;
736
737
list_for_each_entry(curr, &item_list, tree_list) {
738
char *data_ptr;
739
740
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
741
write_extent_buffer(path->nodes[0], &curr->data,
742
(unsigned long)data_ptr, curr->data_len);
743
path->slots[0]++;
744
}
745
746
/*
747
* Now release our path before releasing the delayed items and their
748
* metadata reservations, so that we don't block other tasks for more
749
* time than needed.
750
*/
751
btrfs_release_path(path);
752
753
ASSERT(node->index_item_leaves > 0);
754
755
/*
756
* For normal operations we will batch an entire leaf's worth of delayed
757
* items, so if there are more items to process we can decrement
758
* index_item_leaves by 1 as we inserted 1 leaf's worth of items.
759
*
760
* However for log replay we may not have inserted an entire leaf's
761
* worth of items, we may have not had continuous items, so decrementing
762
* here would mess up the index_item_leaves accounting. For this case
763
* only clean up the accounting when there are no items left.
764
*/
765
if (next && !continuous_keys_only) {
766
/*
767
		 * We inserted one batch of items into a leaf and there are more
768
* items to flush in a future batch, now release one unit of
769
		 * metadata space from the delayed block reserve, corresponding to
770
		 * the leaf we just flushed.
771
*/
772
btrfs_delayed_item_release_leaves(node, 1);
773
node->index_item_leaves--;
774
} else if (!next) {
775
/*
776
* There are no more items to insert. We can have a number of
777
* reserved leaves > 1 here - this happens when many dir index
778
* items are added and then removed before they are flushed (file
779
		 * names with a very short life that never span a transaction). So
780
* release all remaining leaves.
781
*/
782
btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
783
node->index_item_leaves = 0;
784
}
785
786
list_for_each_entry_safe(curr, next, &item_list, tree_list) {
787
list_del(&curr->tree_list);
788
btrfs_release_delayed_item(curr);
789
}
790
out:
791
kfree(ins_data);
792
return ret;
793
}
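/*
 * Batch layout used above, for reference: for a multi-item batch a single
 * allocation backs both arrays consumed by btrfs_insert_empty_items(), with
 * the u32 data sizes first and the keys right after them:
 *
 *	ins_data:        [ u32 sizes[batch.nr] ][ struct btrfs_key keys[batch.nr] ]
 *	batch.data_sizes = (u32 *)ins_data;
 *	batch.keys       = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
 *
 * All keys share the directory's objectid with type BTRFS_DIR_INDEX_KEY, so
 * sorting by key is the same as sorting by the delayed item index.
 */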
794
795
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
796
struct btrfs_path *path,
797
struct btrfs_root *root,
798
struct btrfs_delayed_node *node)
799
{
800
int ret = 0;
801
802
while (ret == 0) {
803
struct btrfs_delayed_item *curr;
804
805
mutex_lock(&node->mutex);
806
curr = __btrfs_first_delayed_insertion_item(node);
807
if (!curr) {
808
mutex_unlock(&node->mutex);
809
break;
810
}
811
ret = btrfs_insert_delayed_item(trans, root, path, curr);
812
mutex_unlock(&node->mutex);
813
}
814
815
return ret;
816
}
817
818
static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
819
struct btrfs_root *root,
820
struct btrfs_path *path,
821
struct btrfs_delayed_item *item)
822
{
823
const u64 ino = item->delayed_node->inode_id;
824
struct btrfs_fs_info *fs_info = root->fs_info;
825
struct btrfs_delayed_item *curr, *next;
826
struct extent_buffer *leaf = path->nodes[0];
827
LIST_HEAD(batch_list);
828
int nitems, slot, last_slot;
829
int ret;
830
u64 total_reserved_size = item->bytes_reserved;
831
832
ASSERT(leaf != NULL);
833
834
slot = path->slots[0];
835
last_slot = btrfs_header_nritems(leaf) - 1;
836
/*
837
* Our caller always gives us a path pointing to an existing item, so
838
* this can not happen.
839
*/
840
ASSERT(slot <= last_slot);
841
if (WARN_ON(slot > last_slot))
842
return -ENOENT;
843
844
nitems = 1;
845
curr = item;
846
list_add_tail(&curr->tree_list, &batch_list);
847
848
/*
849
* Keep checking if the next delayed item matches the next item in the
850
* leaf - if so, we can add it to the batch of items to delete from the
851
* leaf.
852
*/
853
while (slot < last_slot) {
854
struct btrfs_key key;
855
856
next = __btrfs_next_delayed_item(curr);
857
if (!next)
858
break;
859
860
slot++;
861
btrfs_item_key_to_cpu(leaf, &key, slot);
862
if (key.objectid != ino ||
863
key.type != BTRFS_DIR_INDEX_KEY ||
864
key.offset != next->index)
865
break;
866
nitems++;
867
curr = next;
868
list_add_tail(&curr->tree_list, &batch_list);
869
total_reserved_size += curr->bytes_reserved;
870
}
871
872
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
873
if (ret)
874
return ret;
875
876
/* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
877
if (total_reserved_size > 0) {
878
/*
879
* Check btrfs_delayed_item_reserve_metadata() to see why we
880
* don't need to release/reserve qgroup space.
881
*/
882
trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
883
total_reserved_size, 0);
884
btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
885
total_reserved_size, NULL);
886
}
887
888
list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
889
list_del(&curr->tree_list);
890
btrfs_release_delayed_item(curr);
891
}
892
893
return 0;
894
}
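/*
 * Deletion batching note: the loop above only extends the batch while the
 * next delayed item matches the very next slot in the leaf (same objectid,
 * BTRFS_DIR_INDEX_KEY, expected offset), so a single btrfs_del_items() call
 * removes a contiguous run of slots and the reservations of all batched
 * items are then released with one btrfs_block_rsv_release().
 */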
895
896
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
897
struct btrfs_path *path,
898
struct btrfs_root *root,
899
struct btrfs_delayed_node *node)
900
{
901
struct btrfs_key key;
902
int ret = 0;
903
904
key.objectid = node->inode_id;
905
key.type = BTRFS_DIR_INDEX_KEY;
906
907
while (ret == 0) {
908
struct btrfs_delayed_item *item;
909
910
mutex_lock(&node->mutex);
911
item = __btrfs_first_delayed_deletion_item(node);
912
if (!item) {
913
mutex_unlock(&node->mutex);
914
break;
915
}
916
917
key.offset = item->index;
918
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
919
if (ret > 0) {
920
/*
921
* There's no matching item in the leaf. This means we
922
* have already deleted this item in a past run of the
923
* delayed items. We ignore errors when running delayed
924
* items from an async context, through a work queue job
925
* running btrfs_async_run_delayed_root(), and don't
926
* release delayed items that failed to complete. This
927
* is because we will retry later, and at transaction
928
* commit time we always run delayed items and will
929
* then deal with errors if they fail to run again.
930
*
931
* So just release delayed items for which we can't find
932
* an item in the tree, and move to the next item.
933
*/
934
btrfs_release_path(path);
935
btrfs_release_delayed_item(item);
936
ret = 0;
937
} else if (ret == 0) {
938
ret = btrfs_batch_delete_items(trans, root, path, item);
939
btrfs_release_path(path);
940
}
941
942
/*
943
* We unlock and relock on each iteration, this is to prevent
944
* blocking other tasks for too long while we are being run from
945
* the async context (work queue job). Those tasks are typically
946
* running system calls like creat/mkdir/rename/unlink/etc which
947
* need to add delayed items to this delayed node.
948
*/
949
mutex_unlock(&node->mutex);
950
}
951
952
return ret;
953
}
954
955
static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
956
{
957
struct btrfs_delayed_root *delayed_root;
958
959
if (delayed_node &&
960
test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
961
ASSERT(delayed_node->root);
962
clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
963
delayed_node->count--;
964
965
delayed_root = delayed_node->root->fs_info->delayed_root;
966
finish_one_item(delayed_root);
967
}
968
}
969
970
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
971
{
972
973
if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
974
struct btrfs_delayed_root *delayed_root;
975
976
ASSERT(delayed_node->root);
977
delayed_node->count--;
978
979
delayed_root = delayed_node->root->fs_info->delayed_root;
980
finish_one_item(delayed_root);
981
}
982
}
983
984
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
985
struct btrfs_root *root,
986
struct btrfs_path *path,
987
struct btrfs_delayed_node *node)
988
{
989
struct btrfs_fs_info *fs_info = root->fs_info;
990
struct btrfs_key key;
991
struct btrfs_inode_item *inode_item;
992
struct extent_buffer *leaf;
993
int mod;
994
int ret;
995
996
key.objectid = node->inode_id;
997
key.type = BTRFS_INODE_ITEM_KEY;
998
key.offset = 0;
999
1000
if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1001
mod = -1;
1002
else
1003
mod = 1;
1004
1005
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
1006
if (ret > 0)
1007
ret = -ENOENT;
1008
if (ret < 0) {
1009
/*
1010
* If we fail to update the delayed inode we need to abort the
1011
* transaction, because we could leave the inode with the
1012
* improper counts behind.
1013
*/
1014
if (ret != -ENOENT)
1015
btrfs_abort_transaction(trans, ret);
1016
goto out;
1017
}
1018
1019
leaf = path->nodes[0];
1020
inode_item = btrfs_item_ptr(leaf, path->slots[0],
1021
struct btrfs_inode_item);
1022
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1023
sizeof(struct btrfs_inode_item));
1024
1025
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1026
goto out;
1027
1028
/*
1029
* Now we're going to delete the INODE_REF/EXTREF, which should be the
1030
* only one ref left. Check if the next item is an INODE_REF/EXTREF.
1031
*
1032
* But if we're the last item already, release and search for the last
1033
* INODE_REF/EXTREF.
1034
*/
1035
if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
1036
key.objectid = node->inode_id;
1037
key.type = BTRFS_INODE_EXTREF_KEY;
1038
key.offset = (u64)-1;
1039
1040
btrfs_release_path(path);
1041
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1042
if (ret < 0) {
1043
btrfs_abort_transaction(trans, ret);
1044
goto err_out;
1045
}
1046
ASSERT(ret > 0);
1047
ASSERT(path->slots[0] > 0);
1048
ret = 0;
1049
path->slots[0]--;
1050
leaf = path->nodes[0];
1051
} else {
1052
path->slots[0]++;
1053
}
1054
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1055
if (key.objectid != node->inode_id)
1056
goto out;
1057
if (key.type != BTRFS_INODE_REF_KEY &&
1058
key.type != BTRFS_INODE_EXTREF_KEY)
1059
goto out;
1060
1061
/*
1062
	 * Delayed iref deletion is for the inode that has only one link,
1063
* so there is only one iref. The case that several irefs are
1064
* in the same item doesn't exist.
1065
*/
1066
ret = btrfs_del_item(trans, root, path);
1067
if (ret < 0)
1068
btrfs_abort_transaction(trans, ret);
1069
out:
1070
btrfs_release_delayed_iref(node);
1071
btrfs_release_path(path);
1072
err_out:
1073
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
1074
btrfs_release_delayed_inode(node);
1075
return ret;
1076
}
1077
1078
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1079
struct btrfs_root *root,
1080
struct btrfs_path *path,
1081
struct btrfs_delayed_node *node)
1082
{
1083
int ret;
1084
1085
mutex_lock(&node->mutex);
1086
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
1087
mutex_unlock(&node->mutex);
1088
return 0;
1089
}
1090
1091
ret = __btrfs_update_delayed_inode(trans, root, path, node);
1092
mutex_unlock(&node->mutex);
1093
return ret;
1094
}
1095
1096
static inline int
1097
__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1098
struct btrfs_path *path,
1099
struct btrfs_delayed_node *node)
1100
{
1101
int ret;
1102
1103
ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1104
if (ret)
1105
return ret;
1106
1107
ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1108
if (ret)
1109
return ret;
1110
1111
ret = btrfs_record_root_in_trans(trans, node->root);
1112
if (ret)
1113
return ret;
1114
ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1115
return ret;
1116
}
1117
1118
/*
1119
* Called when committing the transaction.
1120
* Returns 0 on success.
1121
* Returns < 0 on error and returns with an aborted transaction with any
1122
* outstanding delayed items cleaned up.
1123
*/
1124
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
1125
{
1126
struct btrfs_fs_info *fs_info = trans->fs_info;
1127
struct btrfs_delayed_root *delayed_root;
1128
struct btrfs_delayed_node *curr_node, *prev_node;
1129
struct btrfs_path *path;
1130
struct btrfs_block_rsv *block_rsv;
1131
int ret = 0;
1132
bool count = (nr > 0);
1133
1134
if (TRANS_ABORTED(trans))
1135
return -EIO;
1136
1137
path = btrfs_alloc_path();
1138
if (!path)
1139
return -ENOMEM;
1140
1141
block_rsv = trans->block_rsv;
1142
trans->block_rsv = &fs_info->delayed_block_rsv;
1143
1144
delayed_root = fs_info->delayed_root;
1145
1146
curr_node = btrfs_first_delayed_node(delayed_root);
1147
while (curr_node && (!count || nr--)) {
1148
ret = __btrfs_commit_inode_delayed_items(trans, path,
1149
curr_node);
1150
if (ret) {
1151
btrfs_abort_transaction(trans, ret);
1152
break;
1153
}
1154
1155
prev_node = curr_node;
1156
curr_node = btrfs_next_delayed_node(curr_node);
1157
/*
1158
* See the comment below about releasing path before releasing
1159
* node. If the commit of delayed items was successful the path
1160
* should always be released, but in case of an error, it may
1161
* point to locked extent buffers (a leaf at the very least).
1162
*/
1163
ASSERT(path->nodes[0] == NULL);
1164
btrfs_release_delayed_node(prev_node);
1165
}
1166
1167
/*
1168
* Release the path to avoid a potential deadlock and lockdep splat when
1169
* releasing the delayed node, as that requires taking the delayed node's
1170
* mutex. If another task starts running delayed items before we take
1171
* the mutex, it will first lock the mutex and then it may try to lock
1172
* the same btree path (leaf).
1173
*/
1174
btrfs_free_path(path);
1175
1176
if (curr_node)
1177
btrfs_release_delayed_node(curr_node);
1178
trans->block_rsv = block_rsv;
1179
1180
return ret;
1181
}
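/*
 * The nr argument bounds how many delayed nodes are processed: a positive
 * value processes at most nr nodes, while the two wrappers below use -1
 * (btrfs_run_delayed_items()) to drain everything and an explicit count
 * (btrfs_run_delayed_items_nr()) when the caller only wants to make partial
 * progress.
 */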
1182
1183
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
1184
{
1185
return __btrfs_run_delayed_items(trans, -1);
1186
}
1187
1188
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
1189
{
1190
return __btrfs_run_delayed_items(trans, nr);
1191
}
1192
1193
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1194
struct btrfs_inode *inode)
1195
{
1196
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1197
BTRFS_PATH_AUTO_FREE(path);
1198
struct btrfs_block_rsv *block_rsv;
1199
int ret;
1200
1201
if (!delayed_node)
1202
return 0;
1203
1204
mutex_lock(&delayed_node->mutex);
1205
if (!delayed_node->count) {
1206
mutex_unlock(&delayed_node->mutex);
1207
btrfs_release_delayed_node(delayed_node);
1208
return 0;
1209
}
1210
mutex_unlock(&delayed_node->mutex);
1211
1212
path = btrfs_alloc_path();
1213
if (!path) {
1214
btrfs_release_delayed_node(delayed_node);
1215
return -ENOMEM;
1216
}
1217
1218
block_rsv = trans->block_rsv;
1219
trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1220
1221
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1222
1223
btrfs_release_delayed_node(delayed_node);
1224
trans->block_rsv = block_rsv;
1225
1226
return ret;
1227
}
1228
1229
int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
1230
{
1231
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1232
struct btrfs_trans_handle *trans;
1233
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1234
struct btrfs_path *path;
1235
struct btrfs_block_rsv *block_rsv;
1236
int ret;
1237
1238
if (!delayed_node)
1239
return 0;
1240
1241
mutex_lock(&delayed_node->mutex);
1242
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1243
mutex_unlock(&delayed_node->mutex);
1244
btrfs_release_delayed_node(delayed_node);
1245
return 0;
1246
}
1247
mutex_unlock(&delayed_node->mutex);
1248
1249
trans = btrfs_join_transaction(delayed_node->root);
1250
if (IS_ERR(trans)) {
1251
ret = PTR_ERR(trans);
1252
goto out;
1253
}
1254
1255
path = btrfs_alloc_path();
1256
if (!path) {
1257
ret = -ENOMEM;
1258
goto trans_out;
1259
}
1260
1261
block_rsv = trans->block_rsv;
1262
trans->block_rsv = &fs_info->delayed_block_rsv;
1263
1264
mutex_lock(&delayed_node->mutex);
1265
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
1266
ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1267
path, delayed_node);
1268
else
1269
ret = 0;
1270
mutex_unlock(&delayed_node->mutex);
1271
1272
btrfs_free_path(path);
1273
trans->block_rsv = block_rsv;
1274
trans_out:
1275
btrfs_end_transaction(trans);
1276
btrfs_btree_balance_dirty(fs_info);
1277
out:
1278
btrfs_release_delayed_node(delayed_node);
1279
1280
return ret;
1281
}
1282
1283
void btrfs_remove_delayed_node(struct btrfs_inode *inode)
1284
{
1285
struct btrfs_delayed_node *delayed_node;
1286
1287
delayed_node = READ_ONCE(inode->delayed_node);
1288
if (!delayed_node)
1289
return;
1290
1291
inode->delayed_node = NULL;
1292
btrfs_release_delayed_node(delayed_node);
1293
}
1294
1295
struct btrfs_async_delayed_work {
1296
struct btrfs_delayed_root *delayed_root;
1297
int nr;
1298
struct btrfs_work work;
1299
};
1300
1301
static void btrfs_async_run_delayed_root(struct btrfs_work *work)
1302
{
1303
struct btrfs_async_delayed_work *async_work;
1304
struct btrfs_delayed_root *delayed_root;
1305
struct btrfs_trans_handle *trans;
1306
struct btrfs_path *path;
1307
struct btrfs_delayed_node *delayed_node = NULL;
1308
struct btrfs_root *root;
1309
struct btrfs_block_rsv *block_rsv;
1310
int total_done = 0;
1311
1312
async_work = container_of(work, struct btrfs_async_delayed_work, work);
1313
delayed_root = async_work->delayed_root;
1314
1315
path = btrfs_alloc_path();
1316
if (!path)
1317
goto out;
1318
1319
do {
1320
if (atomic_read(&delayed_root->items) <
1321
BTRFS_DELAYED_BACKGROUND / 2)
1322
break;
1323
1324
delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
1325
if (!delayed_node)
1326
break;
1327
1328
root = delayed_node->root;
1329
1330
trans = btrfs_join_transaction(root);
1331
if (IS_ERR(trans)) {
1332
btrfs_release_path(path);
1333
btrfs_release_prepared_delayed_node(delayed_node);
1334
total_done++;
1335
continue;
1336
}
1337
1338
block_rsv = trans->block_rsv;
1339
trans->block_rsv = &root->fs_info->delayed_block_rsv;
1340
1341
__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1342
1343
trans->block_rsv = block_rsv;
1344
btrfs_end_transaction(trans);
1345
btrfs_btree_balance_dirty_nodelay(root->fs_info);
1346
1347
btrfs_release_path(path);
1348
btrfs_release_prepared_delayed_node(delayed_node);
1349
total_done++;
1350
1351
} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
1352
|| total_done < async_work->nr);
1353
1354
btrfs_free_path(path);
1355
out:
1356
wake_up(&delayed_root->wait);
1357
kfree(async_work);
1358
}
1359
1360
1361
static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1362
struct btrfs_fs_info *fs_info, int nr)
1363
{
1364
struct btrfs_async_delayed_work *async_work;
1365
1366
async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
1367
if (!async_work)
1368
return -ENOMEM;
1369
1370
async_work->delayed_root = delayed_root;
1371
btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
1372
async_work->nr = nr;
1373
1374
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
1375
return 0;
1376
}
1377
1378
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
1379
{
1380
struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
1381
1382
if (WARN_ON(node))
1383
refcount_dec(&node->refs);
1384
}
1385
1386
static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
1387
{
1388
int val = atomic_read(&delayed_root->items_seq);
1389
1390
if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
1391
return true;
1392
1393
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1394
return true;
1395
1396
return false;
1397
}
1398
1399
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
1400
{
1401
struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
1402
1403
if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
1404
btrfs_workqueue_normal_congested(fs_info->delayed_workers))
1405
return;
1406
1407
if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1408
int seq;
1409
int ret;
1410
1411
seq = atomic_read(&delayed_root->items_seq);
1412
1413
ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
1414
if (ret)
1415
return;
1416
1417
wait_event_interruptible(delayed_root->wait,
1418
could_end_wait(delayed_root, seq));
1419
return;
1420
}
1421
1422
btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
1423
}
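/*
 * Throttling thresholds used above, with the defaults defined at the top of
 * this file: below BTRFS_DELAYED_BACKGROUND (128) pending items, or when the
 * workqueue is congested, nothing is done; at or above
 * BTRFS_DELAYED_WRITEBACK (512) an async flush is kicked off and the caller
 * waits until could_end_wait() is satisfied; in between, an async flush of up
 * to BTRFS_DELAYED_BATCH (16) nodes is queued without waiting.
 */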
1424
1425
static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
1426
{
1427
struct btrfs_fs_info *fs_info = trans->fs_info;
1428
const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
1429
1430
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
1431
return;
1432
1433
/*
1434
* Adding the new dir index item does not require touching another
1435
* leaf, so we can release 1 unit of metadata that was previously
1436
* reserved when starting the transaction. This applies only to
1437
* the case where we had a transaction start and excludes the
1438
* transaction join case (when replaying log trees).
1439
*/
1440
trace_btrfs_space_reservation(fs_info, "transaction",
1441
trans->transid, bytes, 0);
1442
btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
1443
ASSERT(trans->bytes_reserved >= bytes);
1444
trans->bytes_reserved -= bytes;
1445
}
1446
1447
/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
1448
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1449
const char *name, int name_len,
1450
struct btrfs_inode *dir,
1451
const struct btrfs_disk_key *disk_key, u8 flags,
1452
u64 index)
1453
{
1454
struct btrfs_fs_info *fs_info = trans->fs_info;
1455
const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
1456
struct btrfs_delayed_node *delayed_node;
1457
struct btrfs_delayed_item *delayed_item;
1458
struct btrfs_dir_item *dir_item;
1459
bool reserve_leaf_space;
1460
u32 data_len;
1461
int ret;
1462
1463
delayed_node = btrfs_get_or_create_delayed_node(dir);
1464
if (IS_ERR(delayed_node))
1465
return PTR_ERR(delayed_node);
1466
1467
delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
1468
delayed_node,
1469
BTRFS_DELAYED_INSERTION_ITEM);
1470
if (!delayed_item) {
1471
ret = -ENOMEM;
1472
goto release_node;
1473
}
1474
1475
delayed_item->index = index;
1476
1477
dir_item = (struct btrfs_dir_item *)delayed_item->data;
1478
dir_item->location = *disk_key;
1479
btrfs_set_stack_dir_transid(dir_item, trans->transid);
1480
btrfs_set_stack_dir_data_len(dir_item, 0);
1481
btrfs_set_stack_dir_name_len(dir_item, name_len);
1482
btrfs_set_stack_dir_flags(dir_item, flags);
1483
memcpy((char *)(dir_item + 1), name, name_len);
1484
1485
data_len = delayed_item->data_len + sizeof(struct btrfs_item);
1486
1487
mutex_lock(&delayed_node->mutex);
1488
1489
/*
1490
* First attempt to insert the delayed item. This is to make the error
1491
* handling path simpler in case we fail (-EEXIST). There's no risk of
1492
* any other task coming in and running the delayed item before we do
1493
* the metadata space reservation below, because we are holding the
1494
* delayed node's mutex and that mutex must also be locked before the
1495
* node's delayed items can be run.
1496
*/
1497
ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
1498
if (unlikely(ret)) {
1499
btrfs_err(trans->fs_info,
1500
"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
1501
name_len, name, index, btrfs_root_id(delayed_node->root),
1502
delayed_node->inode_id, dir->index_cnt,
1503
delayed_node->index_cnt, ret);
1504
btrfs_release_delayed_item(delayed_item);
1505
btrfs_release_dir_index_item_space(trans);
1506
mutex_unlock(&delayed_node->mutex);
1507
goto release_node;
1508
}
1509
1510
if (delayed_node->index_item_leaves == 0 ||
1511
delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
1512
delayed_node->curr_index_batch_size = data_len;
1513
reserve_leaf_space = true;
1514
} else {
1515
delayed_node->curr_index_batch_size += data_len;
1516
reserve_leaf_space = false;
1517
}
1518
1519
if (reserve_leaf_space) {
1520
ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
1521
/*
1522
* Space was reserved for a dir index item insertion when we
1523
* started the transaction, so getting a failure here should be
1524
* impossible.
1525
*/
1526
if (WARN_ON(ret)) {
1527
btrfs_release_delayed_item(delayed_item);
1528
mutex_unlock(&delayed_node->mutex);
1529
goto release_node;
1530
}
1531
1532
delayed_node->index_item_leaves++;
1533
} else {
1534
btrfs_release_dir_index_item_space(trans);
1535
}
1536
mutex_unlock(&delayed_node->mutex);
1537
1538
release_node:
1539
btrfs_release_delayed_node(delayed_node);
1540
return ret;
1541
}
1542
1543
static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
1544
u64 index)
1545
{
1546
struct btrfs_delayed_item *item;
1547
1548
mutex_lock(&node->mutex);
1549
item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
1550
if (!item) {
1551
mutex_unlock(&node->mutex);
1552
return false;
1553
}
1554
1555
/*
1556
* For delayed items to insert, we track reserved metadata bytes based
1557
* on the number of leaves that we will use.
1558
* See btrfs_insert_delayed_dir_index() and
1559
* btrfs_delayed_item_reserve_metadata()).
1560
*/
1561
ASSERT(item->bytes_reserved == 0);
1562
ASSERT(node->index_item_leaves > 0);
1563
1564
/*
1565
* If there's only one leaf reserved, we can decrement this item from the
1566
* current batch, otherwise we can not because we don't know which leaf
1567
* it belongs to. With the current limit on delayed items, we rarely
1568
* accumulate enough dir index items to fill more than one leaf (even
1569
* when using a leaf size of 4K).
1570
*/
1571
if (node->index_item_leaves == 1) {
1572
const u32 data_len = item->data_len + sizeof(struct btrfs_item);
1573
1574
ASSERT(node->curr_index_batch_size >= data_len);
1575
node->curr_index_batch_size -= data_len;
1576
}
1577
1578
btrfs_release_delayed_item(item);
1579
1580
/* If we now have no more dir index items, we can release all leaves. */
1581
if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
1582
btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
1583
node->index_item_leaves = 0;
1584
}
1585
1586
mutex_unlock(&node->mutex);
1587
return true;
1588
}
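/*
 * This cancellation path is what lets short-lived directory entries, added
 * and removed within the same transaction before any flush, avoid ever
 * reaching the subvolume tree: if the dir index item to delete is still in
 * the insertion rbtree, it is simply dropped here and
 * btrfs_delete_delayed_dir_index() below never queues a deletion item for it.
 */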
1589
1590
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1591
struct btrfs_inode *dir, u64 index)
1592
{
1593
struct btrfs_delayed_node *node;
1594
struct btrfs_delayed_item *item;
1595
int ret;
1596
1597
node = btrfs_get_or_create_delayed_node(dir);
1598
if (IS_ERR(node))
1599
return PTR_ERR(node);
1600
1601
if (btrfs_delete_delayed_insertion_item(node, index)) {
1602
ret = 0;
1603
goto end;
1604
}
1605
1606
item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
1607
if (!item) {
1608
ret = -ENOMEM;
1609
goto end;
1610
}
1611
1612
item->index = index;
1613
1614
ret = btrfs_delayed_item_reserve_metadata(trans, item);
1615
/*
1616
* we have reserved enough space when we start a new transaction,
1617
	 * so a metadata reservation failure here should be impossible.
1618
*/
1619
if (ret < 0) {
1620
btrfs_err(trans->fs_info,
1621
"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d",
1622
index, btrfs_root_id(node->root), node->inode_id, ret);
1623
btrfs_release_delayed_item(item);
1624
goto end;
1625
}
1626
1627
mutex_lock(&node->mutex);
1628
ret = __btrfs_add_delayed_item(node, item);
1629
if (unlikely(ret)) {
1630
btrfs_err(trans->fs_info,
1631
"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d",
1632
index, btrfs_root_id(node->root), node->inode_id, ret);
1633
btrfs_delayed_item_release_metadata(dir->root, item);
1634
btrfs_release_delayed_item(item);
1635
}
1636
mutex_unlock(&node->mutex);
1637
end:
1638
btrfs_release_delayed_node(node);
1639
return ret;
1640
}
1641
1642
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
1643
{
1644
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1645
1646
if (!delayed_node)
1647
return -ENOENT;
1648
1649
/*
1650
* Since we have held i_mutex of this directory, it is impossible that
1651
* a new directory index is added into the delayed node and index_cnt
1652
* is updated now. So we needn't lock the delayed node.
1653
*/
1654
if (!delayed_node->index_cnt) {
1655
btrfs_release_delayed_node(delayed_node);
1656
return -EINVAL;
1657
}
1658
1659
inode->index_cnt = delayed_node->index_cnt;
1660
btrfs_release_delayed_node(delayed_node);
1661
return 0;
1662
}
1663
1664
bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
1665
u64 last_index,
1666
struct list_head *ins_list,
1667
struct list_head *del_list)
1668
{
1669
struct btrfs_delayed_node *delayed_node;
1670
struct btrfs_delayed_item *item;
1671
1672
delayed_node = btrfs_get_delayed_node(inode);
1673
if (!delayed_node)
1674
return false;
1675
1676
/*
1677
* We can only do one readdir with delayed items at a time because of
1678
* item->readdir_list.
1679
*/
1680
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
1681
btrfs_inode_lock(inode, 0);
1682
1683
mutex_lock(&delayed_node->mutex);
1684
item = __btrfs_first_delayed_insertion_item(delayed_node);
1685
while (item && item->index <= last_index) {
1686
refcount_inc(&item->refs);
1687
list_add_tail(&item->readdir_list, ins_list);
1688
item = __btrfs_next_delayed_item(item);
1689
}
1690
1691
item = __btrfs_first_delayed_deletion_item(delayed_node);
1692
while (item && item->index <= last_index) {
1693
refcount_inc(&item->refs);
1694
list_add_tail(&item->readdir_list, del_list);
1695
item = __btrfs_next_delayed_item(item);
1696
}
1697
mutex_unlock(&delayed_node->mutex);
1698
/*
1699
* This delayed node is still cached in the btrfs inode, so refs
1700
* must be > 1 now, and we needn't check it is going to be freed
1701
* or not.
1702
*
1703
	 * Besides that, this function is used for readdir, and we do not
1704
* insert/delete delayed items in this period. So we also needn't
1705
* requeue or dequeue this delayed node.
1706
*/
1707
refcount_dec(&delayed_node->refs);
1708
1709
return true;
1710
}
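/*
 * Locking note: the caller enters with the inode locked in shared mode
 * (BTRFS_ILOCK_SHARED); that lock is dropped and an exclusive lock taken here
 * because item->readdir_list only supports one readdir user at a time, and
 * btrfs_readdir_put_delayed_items() later downgrades back with
 * downgrade_write() before the VFS does its up_read().
 */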
1711
1712
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
1713
struct list_head *ins_list,
1714
struct list_head *del_list)
1715
{
1716
struct btrfs_delayed_item *curr, *next;
1717
1718
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1719
list_del(&curr->readdir_list);
1720
if (refcount_dec_and_test(&curr->refs))
1721
kfree(curr);
1722
}
1723
1724
list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1725
list_del(&curr->readdir_list);
1726
if (refcount_dec_and_test(&curr->refs))
1727
kfree(curr);
1728
}
1729
1730
/*
1731
* The VFS is going to do up_read(), so we need to downgrade back to a
1732
* read lock.
1733
*/
1734
downgrade_write(&inode->vfs_inode.i_rwsem);
1735
}
1736
1737
bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index)
1738
{
1739
struct btrfs_delayed_item *curr;
1740
bool ret = false;
1741
1742
list_for_each_entry(curr, del_list, readdir_list) {
1743
if (curr->index > index)
1744
break;
1745
if (curr->index == index) {
1746
ret = true;
1747
break;
1748
}
1749
}
1750
return ret;
1751
}
1752
1753
/*
1754
* Read dir info stored in the delayed tree.
1755
*/
1756
bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
1757
const struct list_head *ins_list)
1758
{
1759
struct btrfs_dir_item *di;
1760
struct btrfs_delayed_item *curr, *next;
1761
struct btrfs_key location;
1762
char *name;
1763
int name_len;
1764
unsigned char d_type;
1765
1766
/*
1767
* Changing the data of the delayed item is impossible. So
1768
	 * we needn't lock them. And since we hold the i_mutex of the
1769
* directory, nobody can delete any directory indexes now.
1770
*/
1771
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1772
bool over;
1773
1774
list_del(&curr->readdir_list);
1775
1776
if (curr->index < ctx->pos) {
1777
if (refcount_dec_and_test(&curr->refs))
1778
kfree(curr);
1779
continue;
1780
}
1781
1782
ctx->pos = curr->index;
1783
1784
di = (struct btrfs_dir_item *)curr->data;
1785
name = (char *)(di + 1);
1786
name_len = btrfs_stack_dir_name_len(di);
1787
1788
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
1789
btrfs_disk_key_to_cpu(&location, &di->location);
1790
1791
over = !dir_emit(ctx, name, name_len, location.objectid, d_type);
1792
1793
if (refcount_dec_and_test(&curr->refs))
1794
kfree(curr);
1795
1796
if (over)
1797
return true;
1798
ctx->pos++;
1799
}
1800
return false;
1801
}
1802
1803
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
				  struct btrfs_inode_item *inode_item,
				  struct btrfs_inode *inode)
{
	struct inode *vfs_inode = &inode->vfs_inode;
	u64 flags;

	btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
	btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
	btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
	btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
	btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
	btrfs_set_stack_inode_generation(inode_item, inode->generation);
	btrfs_set_stack_inode_sequence(inode_item,
				       inode_peek_iversion(vfs_inode));
	btrfs_set_stack_inode_transid(inode_item, trans->transid);
	btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev);
	flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags);
	btrfs_set_stack_inode_flags(inode_item, flags);
	btrfs_set_stack_inode_block_group(inode_item, 0);

	btrfs_set_stack_timespec_sec(&inode_item->atime,
				     inode_get_atime_sec(vfs_inode));
	btrfs_set_stack_timespec_nsec(&inode_item->atime,
				      inode_get_atime_nsec(vfs_inode));

	btrfs_set_stack_timespec_sec(&inode_item->mtime,
				     inode_get_mtime_sec(vfs_inode));
	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
				      inode_get_mtime_nsec(vfs_inode));

	btrfs_set_stack_timespec_sec(&inode_item->ctime,
				     inode_get_ctime_sec(vfs_inode));
	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
				      inode_get_ctime_nsec(vfs_inode));

	btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec);
}

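/*
 * Fill the VFS fields of @inode from the inode item cached in its delayed
 * node and return the device number through @rdev. Returns -ENOENT if the
 * inode has no delayed node or the delayed node has no dirty inode item.
 */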
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_inode_item *inode_item;
	struct inode *vfs_inode = &inode->vfs_inode;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return -ENOENT;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return -ENOENT;
	}

	inode_item = &delayed_node->inode_item;

	i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
	i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
	btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
	btrfs_inode_set_file_extent_range(inode, 0,
			round_up(i_size_read(vfs_inode), fs_info->sectorsize));
	vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
	set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
	inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
	inode->generation = btrfs_stack_inode_generation(inode_item);
	inode->last_trans = btrfs_stack_inode_transid(inode_item);

	inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item));
	vfs_inode->i_rdev = 0;
	*rdev = btrfs_stack_inode_rdev(inode_item);
	btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
				&inode->flags, &inode->ro_flags);

	inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime),
			btrfs_stack_timespec_nsec(&inode_item->atime));

	inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime),
			btrfs_stack_timespec_nsec(&inode_item->mtime));

	inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime),
			btrfs_stack_timespec_nsec(&inode_item->ctime));

	inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
	inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);

	vfs_inode->i_generation = inode->generation;
	if (S_ISDIR(vfs_inode->i_mode))
		inode->index_cnt = (u64)-1;

	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

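/*
 * Record an inode update in the inode's delayed node instead of updating
 * the fs tree right away. Metadata space is reserved the first time the
 * node's inode item becomes dirty; later updates only refresh the cached
 * inode item.
 */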
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_delayed_node *delayed_node;
	int ret = 0;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
		goto release_node;
	}

	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
	if (ret)
		goto release_node;

	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&root->fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

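/*
 * Record the deletion of the inode's single inode ref in its delayed node.
 * Returns -EAGAIN during log recovery, where delayed inode updates (and
 * therefore delayed inode ref deletions) are not done.
 */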
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_delayed_node *delayed_node;

	/*
	 * We don't do delayed inode updates during log recovery because it
	 * leads to enospc problems. This means we also can't do delayed
	 * inode refs.
	 */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return -EAGAIN;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	/*
	 * We don't reserve space for the inode ref deletion because:
	 * - We ONLY do async inode ref deletion for an inode that has only
	 *   one link (i_nlink == 1), which means there is only one inode ref.
	 *   In most cases the inode ref and the inode item are in the same
	 *   leaf, and we will deal with them at the same time.
	 *   Since we are sure we will reserve the space for the inode item,
	 *   it is unnecessary to reserve space for the inode ref deletion.
	 * - If the inode ref and the inode item are not in the same leaf,
	 *   we also needn't worry about enospc problems, because we reserve
	 *   much more space for the inode update than it needs.
	 * - In the worst case, we can steal some space from the global
	 *   reservation, but that is very rare.
	 */
	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
		goto release_node;

	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

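/*
 * Drop every pending insertion and deletion item of @delayed_node and
 * release the metadata space reserved for them, for index item leaves, for
 * a dirty inode item and for a pending inode ref deletion, without writing
 * anything to the fs tree.
 */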
static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_root *root = delayed_node->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_item *curr_item, *prev_item;

	mutex_lock(&delayed_node->mutex);
	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (curr_item) {
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	if (delayed_node->index_item_leaves > 0) {
		btrfs_delayed_item_release_leaves(delayed_node,
				delayed_node->index_item_leaves);
		delayed_node->index_item_leaves = 0;
	}

	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (curr_item) {
		btrfs_delayed_item_release_metadata(root, curr_item);
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	btrfs_release_delayed_iref(delayed_node);

	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
		btrfs_release_delayed_inode(delayed_node);
	}
	mutex_unlock(&delayed_node->mutex);
}

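/*
 * Throw away all pending delayed items of @inode, if it has a delayed node.
 */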
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return;

	__btrfs_kill_delayed_node(delayed_node);
	btrfs_release_delayed_node(delayed_node);
}

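/*
 * Throw away the delayed items of every delayed node of @root. Nodes are
 * collected from the xarray in batches of up to eight, so the xarray lock
 * is not held while the nodes are killed and released.
 */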
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
	unsigned long index = 0;
	struct btrfs_delayed_node *delayed_nodes[8];

	while (1) {
		struct btrfs_delayed_node *node;
		int count;

		xa_lock(&root->delayed_nodes);
		if (xa_empty(&root->delayed_nodes)) {
			xa_unlock(&root->delayed_nodes);
			return;
		}

		count = 0;
		xa_for_each_start(&root->delayed_nodes, index, node, index) {
			/*
			 * Don't increase refs in case the node is dead and
			 * about to be removed from the xarray in the loop
			 * below.
			 */
			if (refcount_inc_not_zero(&node->refs)) {
				delayed_nodes[count] = node;
				count++;
			}
			if (count >= ARRAY_SIZE(delayed_nodes))
				break;
		}
		xa_unlock(&root->delayed_nodes);
		index++;

		for (int i = 0; i < count; i++) {
			__btrfs_kill_delayed_node(delayed_nodes[i]);
			btrfs_release_delayed_node(delayed_nodes[i]);
		}
	}
}

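/*
 * Throw away the delayed items of every delayed node currently queued on
 * the filesystem's delayed root.
 */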
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_delayed_node *curr_node, *prev_node;

	curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
	while (curr_node) {
		__btrfs_kill_delayed_node(curr_node);

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		btrfs_release_delayed_node(prev_node);
	}
}

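/*
 * Collect the current delayed insertion and deletion items of a directory
 * inode into @ins_list and @del_list for logging, taking an extra reference
 * on each item. Items that are already on a log list are skipped.
 */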
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);
	item = __btrfs_first_delayed_insertion_item(node);
	while (item) {
		/*
		 * It's possible that the item is already in a log list. This
		 * can happen in case two tasks are trying to log the same
		 * directory. For example, if we have task A and task B:
		 *
		 * Task A collected the delayed items into a log list while
		 * under the inode's log_mutex (at btrfs_log_inode()), but it
		 * only releases the items after logging the inodes they point
		 * to (if they are new inodes), which happens after unlocking
		 * the log mutex;
		 *
		 * Task B enters btrfs_log_inode() and acquires the log_mutex
		 * of the same directory inode, before task A releases the
		 * delayed items. This can happen for example when logging
		 * some inode we need to trigger logging of its parent
		 * directory, so logging two files that have the same parent
		 * directory can lead to this.
		 *
		 * If this happens, just ignore delayed items already in a log
		 * list. All the tasks logging the directory are under a log
		 * transaction and whichever finishes first can not sync the
		 * log before the other completes and leaves the log
		 * transaction.
		 */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, ins_list);
		}
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(node);
	while (item) {
		/* It may be non-empty, for the same reason mentioned above. */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, del_list);
		}
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging it. So we never
	 * have the last reference on the delayed inode here.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes), and we don't want such a change because we don't create or
	 * delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}

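/*
 * Release the delayed items previously collected for logging with
 * btrfs_log_get_delayed_items(), marking them as logged and dropping the
 * extra references taken when they were collected.
 */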
void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	struct btrfs_delayed_item *next;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);

	list_for_each_entry_safe(item, next, ins_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	list_for_each_entry_safe(item, next, del_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging it. So we never
	 * have the last reference on the delayed inode here.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes), and we don't want such a change because we don't create or
	 * delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}