GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/btrfs/file.c
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

/* pop a record for an inode into the defrag tree. The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
static int __btrfs_add_inode_defrag(struct inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	p = &root->fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (defrag->ino < entry->ino)
			p = &parent->rb_left;
		else if (defrag->ino > entry->ino)
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			goto exists;
		}
	}
	BTRFS_I(inode)->in_defrag = 1;
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
	return 0;

exists:
	kfree(defrag);
	return 0;

}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *defrag;
	int ret = 0;
	u64 transid;

	if (!btrfs_test_opt(root, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(root->fs_info))
		return 0;

	if (BTRFS_I(inode)->in_defrag)
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = BTRFS_I(inode)->root->last_trans;

	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&root->fs_info->defrag_inodes_lock);
	if (!BTRFS_I(inode)->in_defrag)
		ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&root->fs_info->defrag_inodes_lock);
	return ret;
}

/*
 * must be called with the defrag_inodes lock held
 */
struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
					     struct rb_node **next)
{
	struct inode_defrag *entry = NULL;
	struct rb_node *p;
	struct rb_node *parent = NULL;

	p = info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (ino < entry->ino)
			p = parent->rb_left;
		else if (ino > entry->ino)
			p = parent->rb_right;
		else
			return entry;
	}

	if (next) {
		while (parent && ino > entry->ino) {
			parent = rb_next(parent);
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		}
		*next = parent;
	}
	return NULL;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct rb_node *n;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	u64 first_ino = 0;
	int num_defrag;
	int defrag_batch = 1024;

	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;

	atomic_inc(&fs_info->defrag_running);
	spin_lock(&fs_info->defrag_inodes_lock);
	while (1) {
		n = NULL;

		/* find an inode to defrag */
		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
		if (!defrag) {
			if (n)
				defrag = rb_entry(n, struct inode_defrag, rb_node);
			else if (first_ino) {
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		/* remove it from the rbtree */
		first_ino = defrag->ino + 1;
		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);

		if (btrfs_fs_closing(fs_info))
			goto next_free;

		spin_unlock(&fs_info->defrag_inodes_lock);

		/* get the inode */
		key.objectid = defrag->root;
		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
		key.offset = (u64)-1;
		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
		if (IS_ERR(inode_root))
			goto next;

		key.objectid = defrag->ino;
		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
		key.offset = 0;

		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
		if (IS_ERR(inode))
			goto next;

		/* do a chunk of defrag */
		BTRFS_I(inode)->in_defrag = 0;
		range.start = defrag->last_offset;
		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
					       defrag_batch);
		/*
		 * if we filled the whole defrag batch, there
		 * must be more work to do. Queue this defrag
		 * again
		 */
		if (num_defrag == defrag_batch) {
			defrag->last_offset = range.start;
			__btrfs_add_inode_defrag(inode, defrag);
			/*
			 * we don't want to kfree defrag, we added it back to
			 * the rbtree
			 */
			defrag = NULL;
		} else if (defrag->last_offset && !defrag->cycled) {
			/*
			 * we didn't fill our defrag batch, but
			 * we didn't start at zero. Make sure we loop
			 * around to the start of the file.
			 */
			defrag->last_offset = 0;
			defrag->cycled = 1;
			__btrfs_add_inode_defrag(inode, defrag);
			defrag = NULL;
		}

		iput(inode);
next:
		spin_lock(&fs_info->defrag_inodes_lock);
next_free:
		kfree(defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);

	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy. This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 *
		 * Disable pagefault to avoid recursive lock since
		 * the pages are already locked
		 */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
		pagefault_enable();

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages. These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_aio_write to fault page */
		if (unlikely(copied == 0))
			break;

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
		 * clear it here
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
		      struct page **pages, size_t num_pages,
		      loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
{
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size. There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(inode, end_pos);
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end]. Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		BUG_ON(!split || !split2);

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, -1);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid > ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			WARN_ON(1);
			extent_end = search_start;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		search_start = max(key.offset, start);
		if (recow) {
			btrfs_release_path(path);
			continue;
		}

		/*
		 * | - range to drop - |
		 * | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (disk_bytenr > 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
			key.offset = start;
		}
		/*
		 * | ---- range to drop ----- |
		 * | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, end - key.offset);
				*hint_byte = disk_bytenr;
			}
			break;
		}

		search_start = extent_end;
		/*
		 * | ---- range to drop ----- |
		 * | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, extent_end - start);
				*hint_byte = disk_bytenr;
			}
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * | ---- range to drop ----- |
		 * | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   root->sectorsize);
			} else if (disk_bytenr > 0) {
				ret = btrfs_free_extent(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						key.objectid, key.offset -
						extent_offset);
				BUG_ON(ret);
				inode_sub_bytes(inode,
						extent_end - key.offset);
				*hint_byte = disk_bytenr;
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			BUG_ON(ret);

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG_ON(1);
	}

	if (del_nr > 0) {
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}

	btrfs_free_path(path);
	return ret;
}

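/*
 * helper for btrfs_mark_extent_written: check whether the file extent item
 * in @slot points at the same disk extent (@bytenr, @orig_offset) without
 * compression or encryption, so the two items can be merged. On success
 * *start and *end are set to the item's file range and 1 is returned.
 */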
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		BUG_ON(ret < 0);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   ino, orig_offset);
		BUG_ON(ret);

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset);
		BUG_ON(ret);
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset);
		BUG_ON(ret);
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos)
{
	int ret = 0;

	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}

/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
				  struct page **pages, size_t num_pages,
				  loff_t pos, unsigned long first_index,
				  unsigned long last_index, size_t write_bytes)
{
	struct extent_state *cached_state = NULL;
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	int faili = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
		if (err)
			return err;
	}

again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}
	err = 0;
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos - 1, 0, &cached_state,
				 GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
				 GFP_NOFS);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     start_pos, last_pos - 1, &cached_state,
				     GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;

}

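/*
 * the buffered write loop: reserve delalloc space, prepare and lock the
 * page cache pages, copy the user data in and mark the pages dirty.
 * Returns the number of bytes written, or an error if nothing was copied.
 */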
static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	unsigned long first_index;
	unsigned long last_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		ret = btrfs_delalloc_reserve_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
		if (ret)
			break;

		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_delalloc_release_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
			break;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0)
			dirty_pages = 0;
		else
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;

		/*
		 * If we had a short copy we need to release the excess delalloc
		 * bytes we reserved. We need to increment outstanding_extents
		 * because btrfs_delalloc_release_space will decrement it, but
		 * we still have an outstanding extent for the chunk we actually
		 * managed to copy.
		 */
		if (num_pages > dirty_pages) {
			if (copied > 0)
				atomic_inc(
					&BTRFS_I(inode)->outstanding_extents);
			btrfs_delalloc_release_space(inode,
					(num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT);
		}

		if (copied > 0) {
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
			if (ret) {
				btrfs_delalloc_release_space(inode,
					dirty_pages << PAGE_CACHE_SHIFT);
				btrfs_drop_pages(pages, num_pages);
				break;
			}
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
						   dirty_pages);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root, 1);
		btrfs_throttle(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	return num_written ? num_written : ret;
}

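/*
 * O_DIRECT write path: send the iovec through the generic direct IO code
 * and, if only part of it could be written that way, fall back to the
 * buffered path for the rest and flush that range back out.
 */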
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    loff_t *ppos, size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct iov_iter i;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
					    count, ocount);

	/*
	 * the generic O_DIRECT will update in-memory i_size after the
	 * DIOs are done. But our endio handlers that update the on
	 * disk i_size never update past the in memory i_size. So we
	 * need one more update here to catch any additions to the
	 * file
	 */
	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
		mark_inode_dirty(inode);
	}

	if (written < 0 || written == count)
		return written;

	pos += written;
	count -= written;
	iov_iter_init(&i, iov, nr_segs, count, written);
	written_buffered = __btrfs_buffered_write(file, &i, pos);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	endbyte = pos + written_buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	*ppos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 endbyte >> PAGE_CACHE_SHIFT);
out:
	return written ? written : err;
}

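/*
 * top level write entry point: do the generic checks, update the inode
 * time, then hand the iovec to the O_DIRECT or buffered write path.
 */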
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	loff_t *ppos = &iocb->ki_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	if (count == 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	err = file_remove_suid(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		mutex_unlock(&inode->i_mutex);
		err = -EROFS;
		goto out;
	}

	file_update_time(file);
	BTRFS_I(inode)->sequence++;

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, ppos, count, ocount);
	} else {
		struct iov_iter i;

		iov_iter_init(&i, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &i, pos);
		if (num_written > 0)
			*ppos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	if (num_written > 0 || num_written == -EIOCBQUEUED) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0 && num_written > 0)
			num_written = err;
	}
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

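/*
 * called when the last reference to an open file is dropped: flush any
 * bytes queued up by a truncate-and-rewrite pattern and end a userspace
 * transaction that was left open through the ioctl interface.
 */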
int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size. This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	trace_btrfs_sync_file(file, datasync);

	/* we wait first, since the writeback may change the inode */
	root->log_batch++;
	/* the VFS called filemap_fdatawrite for us */
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans)
		goto out;

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	smp_mb();
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		goto out;
	}

	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0)
		goto out;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log. It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&dentry->d_inode->i_mutex);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		ret = btrfs_end_transaction(trans, root);
	}
	mutex_lock(&dentry->d_inode->i_mutex);
out:
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

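/*
 * wire up the btrfs vm_ops so faults go through filemap_fault and writes
 * to mapped pages go through btrfs_page_mkwrite.
 */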
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	return 0;
}

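/*
 * preallocate space for the range [offset, offset + len): wait for ordered
 * IO, lock the extent range and fill any holes with preallocated extents.
 * Only the FALLOC_FL_KEEP_SIZE mode is supported.
 */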
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct extent_state *cached_state = NULL;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
	struct extent_map *em;
	int ret;

	alloc_start = offset & ~mask;
	alloc_end = (offset + len + mask) & ~mask;

	/* We only support the FALLOC_FL_KEEP_SIZE mode */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	/*
	 * wait for ordered IO before we have any locks. We'll loop again
	 * below with the locks held.
	 */
	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	}

	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
	if (ret)
		goto out;

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			btrfs_wait_ordered_range(inode, alloc_start,
						 alloc_end - alloc_start);
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	cur_offset = alloc_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		BUG_ON(IS_ERR_OR_NULL(em));
		last_byte = min(extent_map_end(em), alloc_end);
		last_byte = (last_byte + mask) & ~mask;
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
							last_byte - cur_offset,
							1 << inode->i_blkbits,
							offset + len,
							&alloc_hint);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		}
		free_extent_map(em);

		cur_offset = last_byte;
		if (cur_offset >= alloc_end) {
			ret = 0;
			break;
		}
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);

	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
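/*
 * file operations for regular btrfs files: reads go through the generic
 * paths, while writes, fsync, mmap and fallocate use the btrfs helpers
 * defined above.
 */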
const struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};