GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/fiemap.c

// SPDX-License-Identifier: GPL-2.0

#include "backref.h"
#include "btrfs_inode.h"
#include "fiemap.h"
#include "file.h"
#include "file-item.h"

struct btrfs_fiemap_entry {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
};

/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, release the subvolume tree search path,
 * flush the fiemap cache, and then relock the file range and re-search the
 * subvolume tree. The value here is negative so that it can't be confused with
 * a valid errno value, and it is different from 1 because 1 is also a return
 * value from fiemap_fill_next_extent() and is often used to mean that a btree
 * search did not find a key; hence the distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
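
/*
 * When emit_fiemap_extent() returns this value, extent_fiemap() unlocks the
 * file range, flushes the buffered entries with flush_fiemap_cache() and then
 * restarts the search at cache.next_search_offset (see the restart label in
 * extent_fiemap()).
 */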

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer ensures that if the fiemap buffer is
 *   memory mapped to the fiemap target file, we don't deadlock during
 *   btrfs_page_mkwrite(). This is because during fiemap we are locking an
 *   extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
	/* An array of ready fiemap entries. */
	struct btrfs_fiemap_entry *entries;
	/* Number of entries in the entries array. */
	int entries_size;
	/* Index of the next entry in the entries array to write to. */
	int entries_pos;
	/*
	 * Once the entries array is full, this holds the offset of the next
	 * file extent item we must search for in the inode's subvolume tree
	 * after unlocking the extent range in the inode's io tree and
	 * releasing the search path.
	 */
	u64 next_search_offset;
	/*
	 * This matches struct fiemap_extent_info::fi_mapped_extents: we use it
	 * to count the emitted extents ourselves and stop, instead of relying
	 * on fiemap_fill_next_extent(), because we buffer ready fiemap entries
	 * in the @entries array and we want to stop as soon as we hit the
	 * maximum number of extents to map, not just to save time but also to
	 * make the logic at extent_fiemap() simpler.
	 */
	unsigned int extents_mapped;
	/* Fields for the cached extent (unsubmitted, not yet ready to emit). */
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};

static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	for (int i = 0; i < cache->entries_pos; i++) {
		struct btrfs_fiemap_entry *entry = &cache->entries[i];
		int ret;

		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
					      entry->phys, entry->len,
					      entry->flags);
		/*
		 * Ignore 1 (reached max entries) because we keep track of that
		 * ourselves in emit_fiemap_extent().
		 */
		if (ret < 0)
			return ret;
	}
	cache->entries_pos = 0;

	return 0;
}

/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one. Only when the merge fails is the
 * cached one submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache,
			      u64 offset, u64 phys, u64 len, u32 flags)
{
	struct btrfs_fiemap_entry *entry;
	u64 cache_end;

	/* Set at the end of extent_fiemap(). */
	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

	if (!cache->cached)
		goto assign;

	/*
	 * When iterating the extents of the inode, at extent_fiemap(), we may
	 * find an extent that starts at an offset behind the end offset of the
	 * previous extent we processed. This happens if fiemap is called
	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
	 * after we had to unlock the file range, release the search path, emit
	 * the fiemap extents stored in the buffer (cache->entries array) and
	 * then lock the remainder of the range and re-search the btree.
	 *
	 * For example, we are in leaf X processing its last item, which is the
	 * file extent item for file range [512K, 1M[, and after
	 * btrfs_next_leaf() releases the path, there's an ordered extent that
	 * completes for the file range [768K, 2M[, and that results in trimming
	 * the file extent item so that it now corresponds to the file range
	 * [512K, 768K[ and a new file extent item is inserted for the file
	 * range [768K, 2M[, which may end up as the last item of leaf X or as
	 * the first item of the next leaf - in either case btrfs_next_leaf()
	 * will leave us with a path pointing to the new extent item, for the
	 * file range [768K, 2M[, since that's the first key that follows the
	 * last one we processed. So in order not to report overlapping extents
	 * to user space, we trim the length of the previously cached extent and
	 * emit it.
	 *
	 * Upon calling btrfs_next_leaf() we may also find an extent with an
	 * offset smaller than or equal to cache->offset, and this happens
	 * when we had a hole or prealloc extent with several delalloc ranges in
	 * it, but after btrfs_next_leaf() released the path, delalloc was
	 * flushed and the resulting ordered extents were completed, so we can
	 * now have found a file extent item for an offset that is smaller than
	 * or equal to what we have in cache->offset. We deal with this as
	 * described below.
	 */
	cache_end = cache->offset + cache->len;
	if (cache_end > offset) {
		if (offset == cache->offset) {
			/*
			 * We cached a delalloc range (found in the io tree) for
			 * a hole or prealloc extent and we have now found a
			 * file extent item for the same offset. What we have
			 * now is more recent and up to date, so discard what
			 * we had in the cache and use what we have just found.
			 */
			goto assign;
		} else if (offset > cache->offset) {
			/*
			 * The extent range we previously found ends after the
			 * offset of the file extent item we found and that
			 * offset falls somewhere in the middle of that previous
			 * extent range. So adjust the range we previously found
			 * to end at the offset of the file extent item we have
			 * just found, since this extent is more up to date.
			 * Emit that adjusted range and cache the file extent
			 * item we have just found. This corresponds to the case
			 * where a previously found file extent item was split
			 * due to an ordered extent completing.
			 */
			cache->len = offset - cache->offset;
			goto emit;
		} else {
			const u64 range_end = offset + len;

			/*
			 * The offset of the file extent item we have just found
			 * is behind the cached offset. This means we were
			 * processing a hole or prealloc extent for which we
			 * have found delalloc ranges (in the io tree), so what
			 * we have in the cache is the last delalloc range we
			 * found while the file extent item we found can be
			 * either for a whole delalloc range we previously
			 * emitted or only a part of that range.
			 *
			 * We have two cases here:
			 *
			 * 1) The file extent item's range ends at or behind the
			 *    cached extent's end. In this case just ignore the
			 *    current file extent item because we don't want to
			 *    overlap with previous ranges that may have been
			 *    emitted already;
			 *
			 * 2) The file extent item starts behind the currently
			 *    cached extent but its end offset goes beyond the
			 *    end offset of the cached extent. We don't want to
			 *    overlap with a previous range that may have been
			 *    emitted already, so we emit the currently cached
			 *    extent and then partially store the current file
			 *    extent item's range in the cache, for the subrange
			 *    going from the cached extent's end to the end of
			 *    the file extent item.
			 */
			if (range_end <= cache_end)
				return 0;

			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
				phys += cache_end - offset;

			offset = cache_end;
			len = range_end - cache_end;
			goto emit;
		}
	}

	/*
	 * Only merge fiemap extents if:
	 *
	 * 1) Their logical addresses are continuous;
	 *
	 * 2) Their physical addresses are continuous.
	 *    So truly compressed (physical size smaller than logical size)
	 *    extents won't get merged with each other;
	 *
	 * 3) They share the same flags.
	 */
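	/*
	 * For example, if the cached extent is [0, 128K[ at physical offset 1M
	 * with no special flags, and the new extent has offset 128K, phys
	 * 1M + 128K and the same flags, the condition below holds and the
	 * cached length simply grows to 256K instead of a new entry being
	 * stored.
	 */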
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    cache->flags == flags) {
		cache->len += len;
		return 0;
	}

emit:
	/* Not mergeable, need to submit the cached one. */

	if (cache->entries_pos == cache->entries_size) {
		/*
		 * We will need to re-search starting from the end offset of
		 * the last stored extent and not from the current offset,
		 * because after unlocking the range and releasing the path,
		 * if there's a hole between that end offset and this current
		 * offset, a new extent may have been inserted due to a new
		 * write, so we don't want to miss it.
		 */
		entry = &cache->entries[cache->entries_size - 1];
		cache->next_search_offset = entry->offset + entry->len;
		cache->cached = false;

		return BTRFS_FIEMAP_FLUSH_CACHE;
	}

	entry = &cache->entries[cache->entries_pos];
	entry->offset = cache->offset;
	entry->phys = cache->phys;
	entry->len = cache->len;
	entry->flags = cache->flags;
	cache->entries_pos++;
	cache->extents_mapped++;

	if (cache->extents_mapped == fieinfo->fi_extents_max) {
		cache->cached = false;
		return 1;
	}
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;

	return 0;
}
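
/*
 * A caller of emit_fiemap_extent() therefore sees one of:
 *
 *   0                        - the extent was merged into or stored in the cache;
 *   1                        - fieinfo->fi_extents_max extents are buffered, stop;
 *   BTRFS_FIEMAP_FLUSH_CACHE - the entries array is full, so extent_fiemap()
 *                              must flush it and restart the search at
 *                              cache->next_search_offset.
 */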

/*
 * Emit last fiemap cache
 *
 * The last fiemap cache may still be cached in the following case:
 * 0                  4k                    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct extent_buffer *clone = path->nodes[0];
	struct btrfs_key key;
	int slot;
	int ret;

	path->slots[0]++;
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
		return 0;

	/*
	 * Add a temporary extra ref to an already cloned extent buffer to
	 * prevent btrfs_next_leaf() from freeing it; we want to reuse it to
	 * avoid the cost of allocating a new one.
	 */
	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
	refcount_inc(&clone->refs);

	ret = btrfs_next_leaf(inode->root, path);
	if (ret != 0)
		goto out;

	/*
	 * Don't bother with cloning if there are no more file extent items for
	 * our inode.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = 1;
		goto out;
	}

	/*
	 * Important to preserve the start field, for the optimizations when
	 * checking if extents are shared (see extent_fiemap()).
	 *
	 * We must set ->start before calling copy_extent_buffer_full(). If we
	 * are on a sub-pagesize blocksize, we use ->start to determine the
	 * offset into the folio where our eb exists, and if we update ->start
	 * after the fact then any subsequent reads of the eb may read from a
	 * different offset in the folio than where we originally copied into.
	 */
	clone->start = path->nodes[0]->start;
	/* See the comment at fiemap_search_slot() about why we clone. */
	copy_extent_buffer_full(clone, path->nodes[0]);

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;
out:
	if (ret)
		free_extent_buffer(clone);

	return ret;
}

/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
			      u64 file_offset)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *clone;
	struct btrfs_key key;
	int slot;
	int ret;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = file_offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret != 0)
			return ret;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			return 1;
	}

	/*
	 * We clone the leaf and use it during fiemap. This is because while
	 * using the leaf we do expensive things like checking if an extent is
	 * shared, which can take a long time. In order to prevent blocking
	 * other tasks for too long, we use a clone of the leaf. We have locked
	 * the file range in the inode's io tree, so we know none of our file
	 * extent items can change. This way we avoid blocking other tasks that
	 * want to insert items for other inodes in the same leaf or b+tree
	 * rebalance operations (triggered for example when someone is trying
	 * to push items into this leaf when trying to insert an item in a
	 * neighbour leaf).
	 * We also need the private clone because holding a read lock on an
	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
	 * when we check if extents are shared, as backref walking may need to
	 * lock the same leaf we are processing.
	 */
	clone = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!clone)
		return -ENOMEM;

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;

	return 0;
}
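
/*
 * For example, when fiemap_search_slot() looks for file_offset == 100K and the
 * inode has file extent items starting at offsets 0 and 64K, the exact key
 * does not exist, so btrfs_search_slot() returns > 0 and leaves the path after
 * the 64K item; the code above then steps back one slot to the 64K item since
 * its range may cover the 100K offset.
 */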

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
static int fiemap_process_hole(struct btrfs_inode *inode,
			       struct fiemap_extent_info *fieinfo,
			       struct fiemap_cache *cache,
			       struct extent_state **delalloc_cached_state,
			       struct btrfs_backref_share_check_ctx *backref_ctx,
			       u64 disk_bytenr, u64 extent_offset,
			       u64 extent_gen,
			       u64 start, u64 end)
{
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 cur_offset = start;
	u64 last_delalloc_end = 0;
	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
	bool checked_extent_shared = false;
	int ret;

	/*
	 * There can be no delalloc past i_size, so don't waste time looking for
	 * it beyond i_size.
	 */
	while (cur_offset < end && cur_offset < i_size) {
		u64 delalloc_start;
		u64 delalloc_end;
		u64 prealloc_start;
		u64 prealloc_len = 0;
		bool delalloc;

		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
							delalloc_cached_state,
							&delalloc_start,
							&delalloc_end);
		if (!delalloc)
			break;

		/*
		 * If this is a prealloc extent we have to report every section
		 * of it that has no delalloc.
		 */
		if (disk_bytenr != 0) {
			if (last_delalloc_end == 0) {
				prealloc_start = start;
				prealloc_len = delalloc_start - start;
			} else {
				prealloc_start = last_delalloc_end + 1;
				prealloc_len = delalloc_start - prealloc_start;
			}
		}

		if (prealloc_len > 0) {
			if (!checked_extent_shared && fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					return ret;
				else if (ret > 0)
					prealloc_flags |= FIEMAP_EXTENT_SHARED;

				checked_extent_shared = true;
			}
			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
						 disk_bytenr + extent_offset,
						 prealloc_len, prealloc_flags);
			if (ret)
				return ret;
			extent_offset += prealloc_len;
		}

		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
					 delalloc_end + 1 - delalloc_start,
					 FIEMAP_EXTENT_DELALLOC |
					 FIEMAP_EXTENT_UNKNOWN);
		if (ret)
			return ret;

		last_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		extent_offset += cur_offset - delalloc_start;
		cond_resched();
	}

	/*
	 * Either we found no delalloc for the whole prealloc extent or we have
	 * a prealloc extent that spans i_size or starts at or after i_size.
	 */
	if (disk_bytenr != 0 && last_delalloc_end < end) {
		u64 prealloc_start;
		u64 prealloc_len;

		if (last_delalloc_end == 0) {
			prealloc_start = start;
			prealloc_len = end + 1 - start;
		} else {
			prealloc_start = last_delalloc_end + 1;
			prealloc_len = end + 1 - prealloc_start;
		}

		if (!checked_extent_shared && fieinfo->fi_extents_max) {
			ret = btrfs_is_data_extent_shared(inode,
							  disk_bytenr,
							  extent_gen,
							  backref_ctx);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				prealloc_flags |= FIEMAP_EXTENT_SHARED;
		}
		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
					 disk_bytenr + extent_offset,
					 prealloc_len, prealloc_flags);
		if (ret)
			return ret;
	}

	return 0;
}
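
/*
 * For example, fiemap_process_hole() on a prealloc extent covering [0, 1M[
 * with a single delalloc range at [256K, 320K[ (and i_size >= 1M) emits three
 * ranges: [0, 256K[ with FIEMAP_EXTENT_UNWRITTEN, [256K, 320K[ with
 * FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN, and, after the loop exits,
 * the trailing [320K, 1M[ with FIEMAP_EXTENT_UNWRITTEN.
 */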

static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  u64 *last_extent_end_ret)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	u64 disk_bytenr;
	int ret;

	/*
	 * Look up the last file extent. We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
	/* There can't be a file extent item at offset (u64)-1. */
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	/*
	 * For a non-existing key, btrfs_search_slot() always leaves us at a
	 * slot > 0, except if the btree is empty, which is impossible because
	 * at least it has the inode item for this inode and all the items for
	 * the root inode 256.
	 */
	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* No file extent items in the subvolume tree. */
		*last_extent_end_ret = 0;
		return 0;
	}

	/*
	 * For an inline extent, the disk_bytenr is where the inline data
	 * starts, so first check if we have an inline extent item before
	 * checking if we have an implicit hole (disk_bytenr == 0).
	 */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
		*last_extent_end_ret = btrfs_file_extent_end(path);
		return 0;
	}

	/*
	 * Find the last file extent item that is not a hole (when NO_HOLES is
	 * not enabled). This should take at most 2 iterations in the worst
	 * case: we have one hole file extent item at slot 0 of a leaf and
	 * another hole file extent item as the last item in the previous leaf.
	 * This is because we merge file extent items that represent holes.
	 */
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	while (disk_bytenr == 0) {
		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
		if (ret < 0) {
			return ret;
		} else if (ret > 0) {
			/* No file extent items that are not holes. */
			*last_extent_end_ret = 0;
			return 0;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	}

	*last_extent_end_ret = btrfs_file_extent_end(path);
	return 0;
}

static int extent_fiemap(struct btrfs_inode *inode,
			 struct fiemap_extent_info *fieinfo,
			 u64 start, u64 len)
{
	const u64 ino = btrfs_ino(inode);
	struct extent_state *cached_state = NULL;
	struct extent_state *delalloc_cached_state = NULL;
	BTRFS_PATH_AUTO_FREE(path);
	struct fiemap_cache cache = { 0 };
	struct btrfs_backref_share_check_ctx *backref_ctx;
	u64 last_extent_end = 0;
	u64 prev_extent_end;
	u64 range_start;
	u64 range_end;
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	bool stopped = false;
	int ret;

	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
	cache.entries = kmalloc_array(cache.entries_size,
				      sizeof(struct btrfs_fiemap_entry),
				      GFP_KERNEL);
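	/*
	 * Note: with 4K pages and natural 8-byte alignment of the u64 members,
	 * sizeof(struct btrfs_fiemap_entry) is 32 bytes, so the buffer holds
	 * 128 entries before emit_fiemap_extent() requests a flush and restart
	 * (the exact count depends on PAGE_SIZE and the struct layout).
	 */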
	backref_ctx = btrfs_alloc_backref_share_check_ctx();
	path = btrfs_alloc_path();
	if (!cache.entries || !backref_ctx || !path) {
		ret = -ENOMEM;
		goto out;
	}

restart:
	range_start = round_down(start, sectorsize);
	range_end = round_up(start + len, sectorsize);
	prev_extent_end = range_start;

	btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
	if (ret < 0)
		goto out_unlock;
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	ret = fiemap_search_slot(inode, path, range_start);
	if (ret < 0) {
		goto out_unlock;
	} else if (ret > 0) {
		/*
		 * No file extent item found, but we may have delalloc between
		 * the current offset and i_size. So check for that.
		 */
		ret = 0;
		goto check_eof_delalloc;
	}

	while (prev_extent_end < range_end) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *ei;
		struct btrfs_key key;
		u64 extent_end;
		u64 extent_len;
		u64 extent_offset = 0;
		u64 extent_gen;
		u64 disk_bytenr = 0;
		u64 flags = 0;
		int extent_type;
		u8 compression;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * The first iteration can leave us at an extent item that ends
		 * before our range's start. Move to the next item.
		 */
		if (extent_end <= range_start)
			goto next_item;

		backref_ctx->curr_leaf_bytenr = leaf->start;

		/* We have an implicit hole (NO_HOLES feature enabled). */
		if (prev_extent_end < key.offset) {
			const u64 hole_end = min(key.offset, range_end) - 1;

			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  prev_extent_end, hole_end);
			if (ret < 0) {
				goto out_unlock;
			} else if (ret > 0) {
				/* fiemap_fill_next_extent() told us to stop. */
				stopped = true;
				break;
			}

			/* We've reached the end of the fiemap range, stop. */
			if (key.offset >= range_end) {
				stopped = true;
				break;
			}
		}

		extent_len = extent_end - key.offset;
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		compression = btrfs_file_extent_compression(leaf, ei);
		extent_type = btrfs_file_extent_type(leaf, ei);
		extent_gen = btrfs_file_extent_generation(leaf, ei);

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
			if (compression == BTRFS_COMPRESS_NONE)
				extent_offset = btrfs_file_extent_offset(leaf, ei);
		}

		if (compression != BTRFS_COMPRESS_NONE)
			flags |= FIEMAP_EXTENT_ENCODED;

		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			flags |= FIEMAP_EXTENT_DATA_INLINE;
			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
						 extent_len, flags);
		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx,
						  disk_bytenr, extent_offset,
						  extent_gen, key.offset,
						  extent_end - 1);
		} else if (disk_bytenr == 0) {
			/* We have an explicit hole. */
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  key.offset, extent_end - 1);
		} else {
			/* We have a regular extent. */
			if (fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					goto out_unlock;
				else if (ret > 0)
					flags |= FIEMAP_EXTENT_SHARED;
			}

			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
						 disk_bytenr + extent_offset,
						 extent_len, flags);
		}

		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* emit_fiemap_extent() told us to stop. */
			stopped = true;
			break;
		}

		prev_extent_end = extent_end;
next_item:
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out_unlock;
		}

		ret = fiemap_next_leaf_item(inode, path);
		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* No more file extent items for this inode. */
			break;
		}
		cond_resched();
	}

check_eof_delalloc:
	if (!stopped && prev_extent_end < range_end) {
		ret = fiemap_process_hole(inode, fieinfo, &cache,
					  &delalloc_cached_state, backref_ctx,
					  0, 0, 0, prev_extent_end, range_end - 1);
		if (ret < 0)
			goto out_unlock;
		prev_extent_end = range_end;
	}

	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
		const u64 i_size = i_size_read(&inode->vfs_inode);

		if (prev_extent_end < i_size) {
			u64 delalloc_start;
			u64 delalloc_end;
			bool delalloc;

			delalloc = btrfs_find_delalloc_in_range(inode,
								prev_extent_end,
								i_size - 1,
								&delalloc_cached_state,
								&delalloc_start,
								&delalloc_end);
			if (!delalloc)
				cache.flags |= FIEMAP_EXTENT_LAST;
		} else {
			cache.flags |= FIEMAP_EXTENT_LAST;
		}
	}

out_unlock:
	btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
		btrfs_release_path(path);
		ret = flush_fiemap_cache(fieinfo, &cache);
		if (ret)
			goto out;
		len -= cache.next_search_offset - start;
		start = cache.next_search_offset;
		goto restart;
	} else if (ret < 0) {
		goto out;
	}

	/*
	 * Must free the path before emitting to the fiemap buffer because we
	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
	 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
	 * waiting for an ordered extent that in order to complete needs to
	 * modify that leaf, therefore leading to a deadlock.
	 */
	btrfs_free_path(path);
	path = NULL;

	ret = flush_fiemap_cache(fieinfo, &cache);
	if (ret)
		goto out;

	ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
	btrfs_free_extent_state(delalloc_cached_state);
	kfree(cache.entries);
	btrfs_free_backref_share_ctx(backref_ctx);
	return ret;
}

int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of the data (in an async thread) and will return
	 * before the compression is done and writeback is started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

	/*
	 * We did an initial flush to avoid holding the inode's lock while
	 * triggering writeback and waiting for the completion of IO and ordered
	 * extents. Now after we locked the inode we do it again, because it's
	 * possible a new write may have happened in between those two steps.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret) {
			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
			return ret;
		}
	}

	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

	return ret;
}
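
/*
 * For reference, a rough user-space sketch of how this entry point is reached
 * through the FIEMAP ioctl. Names outside this file (fd, the 128-extent
 * buffer size) are illustrative assumptions, not part of this code:
 *
 *	#include <linux/fiemap.h>
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *
 *	char buf[sizeof(struct fiemap) + 128 * sizeof(struct fiemap_extent)] = {0};
 *	struct fiemap *fm = (struct fiemap *)buf;
 *
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;
 *	fm->fm_extent_count = 128;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		// fm->fm_mapped_extents entries are now in fm->fm_extents[]
 *
 * The VFS routes FS_IOC_FIEMAP to btrfs_fiemap() via the inode's ->fiemap
 * operation, and fieinfo->fi_extents_max corresponds to fm_extent_count.
 */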