GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/abd.c

// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *         +-------------------+
 *         | ABD (linear)      |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |     +--------------------------------+
 *         | abd_buf --------------->| raw buffer of size abd_size    |
 *         +-------------------+     +--------------------------------+
 *              no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *         +-------------------+
 *         | ABD (scattered)   |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |
 *         | abd_offset = 0    |                        +-----------+
 *         | abd_chunks[0] ----------------------------->| chunk 0   |
 *         | abd_chunks[1] ---------------------+        +-----------+
 *         | ...               |                |        +-----------+
 *         | abd_chunks[N-1] ---------+         +------->| chunk 1   |
 *         +-------------------+      |                  +-----------+
 *                                    |                      ...
 *                                    |                  +-----------+
 *                                    +----------------->| chunk N-1 |
 *                                                       +-----------+
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and the
 * original ABD's abd_children refcount is incremented. This data allows us to
 * ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
 *
 * If you need to use the data within an ABD directly, and you know it's
 * linear (because you allocated it), you can use abd_to_buf() to access the
 * underlying raw buffer. Otherwise, you should use one of the
 * abd_borrow_buf* functions, which will allocate a raw buffer if necessary.
 * Use the abd_return_buf* functions to return any raw buffers that are no
 * longer necessary when you're done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 *
 * As an additional feature, linear and scatter ABDs can be stitched together
 * using the gang ABD type (abd_alloc_gang()). This allows multiple ABDs to
 * be viewed as a single ABD.
 *
 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled
 * to B_FALSE.
 */
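
/*
 * Example (editor's sketch, not part of the upstream file): a typical
 * round-trip through the ABD API using only functions defined below. The
 * 4096-byte size and the stack buffers are arbitrary illustrations.
 *
 *	char src[4096], dst[4096];
 *	abd_t *abd = abd_alloc(sizeof (src), B_FALSE);
 *
 *	abd_copy_from_buf_off(abd, src, 0, sizeof (src));
 *	abd_copy_to_buf_off(dst, abd, 0, sizeof (dst));
 *	ASSERT0(abd_cmp_buf_off(abd, dst, 0, sizeof (dst)));
 *
 *	abd_free(abd);
 *
 * Whether the data lands in one linear buffer or in scattered chunks is
 * decided by abd_size_alloc_linear() and is transparent to this caller.
 */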

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

/* see block comment above for description */
int zfs_abd_scatter_enabled = B_TRUE;

void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
	if (abd_is_from_pages(abd)) {
		ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
	} else {
		ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	}
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3U(abd->abd_size, >, 0);
		ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
	} else if (abd_is_gang(abd)) {
		uint_t child_sizes = 0;
		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
		    cabd != NULL;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT(list_link_active(&cabd->abd_gang_link));
			child_sizes += cabd->abd_size;
			abd_verify(cabd);
		}
		ASSERT3U(abd->abd_size, ==, child_sizes);
	} else {
		ASSERT3U(abd->abd_size, >, 0);
		abd_verify_scatter(abd);
	}
#endif
}

void
abd_init_struct(abd_t *abd)
{
	list_link_init(&abd->abd_gang_link);
	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
	abd->abd_flags = 0;
#ifdef ZFS_DEBUG
	zfs_refcount_create(&abd->abd_children);
	abd->abd_parent = NULL;
#endif
	abd->abd_size = 0;
}

static void
abd_fini_struct(abd_t *abd)
{
	mutex_destroy(&abd->abd_mtx);
	ASSERT(!list_link_active(&abd->abd_gang_link));
#ifdef ZFS_DEBUG
	zfs_refcount_destroy(&abd->abd_children);
#endif
}

abd_t *
abd_alloc_struct(size_t size)
{
	abd_t *abd = abd_alloc_struct_impl(size);
	abd_init_struct(abd);
	abd->abd_flags |= ABD_FLAG_ALLOCD;
	return (abd);
}

void
abd_free_struct(abd_t *abd)
{
	abd_fini_struct(abd);
	abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	if (abd_size_alloc_linear(size))
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd_t *abd = abd_alloc_struct(size);
	abd->abd_flags |= ABD_FLAG_OWNER;
	abd->abd_u.abd_scatter.abd_offset = 0;
	abd_alloc_chunks(abd, size);

	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	abd_update_scatter_stats(abd, ABDSTAT_INCR);

	return (abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	if (is_metadata) {
		ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
	} else {
		ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);

	return (abd);
}
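
/*
 * Example (editor's sketch, not in the upstream file): when a consumer needs
 * a flat buffer, allocating linearly up front lets it use abd_to_buf()
 * directly instead of borrowing a copy.
 *
 *	abd_t *abd = abd_alloc_linear(SPA_MINBLOCKSIZE, B_FALSE);
 *	char *buf = abd_to_buf(abd);
 *
 *	memset(buf, 0xa5, abd->abd_size);	(fill in place)
 *	abd_free(abd);
 *
 * abd_to_buf() would assert if handed a scattered ABD, which is why it is
 * only safe on ABDs the caller knows are linear.
 */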

static void
abd_free_linear(abd_t *abd)
{
	if (abd_is_linear_page(abd)) {
		abd_free_linear_page(abd);
		return;
	}

	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	} else {
		zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	}

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}

static void
abd_free_gang(abd_t *abd)
{
	ASSERT(abd_is_gang(abd));
	abd_t *cabd;

	while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
		/*
		 * We must acquire the child ABD's mutex to ensure that if it
		 * is being added to another gang ABD we will set the link
		 * as inactive when removing it from this gang ABD and before
		 * adding it to the other gang ABD.
		 */
		mutex_enter(&cabd->abd_mtx);
		ASSERT(list_link_active(&cabd->abd_gang_link));
		list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
		mutex_exit(&cabd->abd_mtx);
		if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
			abd_free(cabd);
	}
	list_destroy(&ABD_GANG(abd).abd_gang_chain);
}

static void
abd_free_scatter(abd_t *abd)
{
	abd_free_chunks(abd);
	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
 * and abd_get_*(), including abd_get_offset_struct().
 *
 * If the ABD was created with abd_alloc_*(), the underlying data
 * (scatterlist or linear buffer) will also be freed. (Subject to ownership
 * changes via abd_*_ownership_of_buf().)
 *
 * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
 * also be freed.
 */
void
abd_free(abd_t *abd)
{
	if (abd == NULL)
		return;

	abd_verify(abd);
#ifdef ZFS_DEBUG
	IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
#endif

	if (abd_is_gang(abd)) {
		abd_free_gang(abd);
	} else if (abd_is_linear(abd)) {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_linear(abd);
	} else {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_scatter(abd);
	}

#ifdef ZFS_DEBUG
	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}
#endif

	abd_fini_struct(abd);
	if (abd->abd_flags & ABD_FLAG_ALLOCD)
		abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd) &&
	    !abd_is_linear_page(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}

/*
 * Create a gang ABD that will be the head of a list of ABDs. This is used
 * to "chain" scatter/gather lists together when constructing aggregated
 * I/Os. To free this abd, abd_free() must be called.
 */
abd_t *
abd_alloc_gang(void)
{
	abd_t *abd = abd_alloc_struct(0);
	abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
	list_create(&ABD_GANG(abd).abd_gang_chain,
	    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
	return (abd);
}

/*
 * Add a child gang ABD to a parent gang ABD's chained list.
 */
static void
abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	ASSERT(abd_is_gang(cabd));

	if (free_on_free) {
		/*
		 * If the parent is responsible for freeing the child gang
		 * ABD we will just splice the child's children ABD list to
		 * the parent's list and immediately free the child gang ABD
		 * struct. The children from the child gang ABD retain all
		 * their free_on_free settings after being added to the
		 * parent's list.
		 */
#ifdef ZFS_DEBUG
		/*
		 * If cabd had abd_parent, we have to drop it here. We can't
		 * transfer it to pabd, nor can we clear abd_size while
		 * leaving it in place.
		 */
		if (cabd->abd_parent != NULL) {
			(void) zfs_refcount_remove_many(
			    &cabd->abd_parent->abd_children,
			    cabd->abd_size, cabd);
			cabd->abd_parent = NULL;
		}
#endif
		pabd->abd_size += cabd->abd_size;
		cabd->abd_size = 0;
		list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
		    &ABD_GANG(cabd).abd_gang_chain);
		ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
		abd_verify(pabd);
		abd_free(cabd);
	} else {
		for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
		    child != NULL;
		    child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
			/*
			 * We always pass B_FALSE for free_on_free as it is
			 * the original child gang ABD's responsibility to
			 * determine if any of its child ABDs should be freed
			 * on the call to abd_free().
			 */
			abd_gang_add(pabd, child, B_FALSE);
		}
		abd_verify(pabd);
	}
}

/*
 * Add a child ABD to a gang ABD's chained list.
 */
void
abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	abd_t *child_abd = NULL;

	/*
	 * If the child being added is a gang ABD, we will add the
	 * child's ABDs to the parent gang ABD. This allows us to account
	 * for the offset correctly in the parent gang ABD.
	 */
	if (abd_is_gang(cabd)) {
		ASSERT(!list_link_active(&cabd->abd_gang_link));
		return (abd_gang_add_gang(pabd, cabd, free_on_free));
	}
	ASSERT(!abd_is_gang(cabd));

	/*
	 * In order to verify that an ABD is not already part of
	 * another gang ABD, we must lock the child ABD's abd_mtx
	 * to check its abd_gang_link status. We unlock the abd_mtx
	 * only after it has been added to a gang ABD, which
	 * will update the abd_gang_link's status. See comment below
	 * for how an ABD can be in multiple gang ABDs simultaneously.
	 */
	mutex_enter(&cabd->abd_mtx);
	if (list_link_active(&cabd->abd_gang_link)) {
		/*
		 * If the child ABD is already part of another
		 * gang ABD then we must allocate a new
		 * ABD to use a separate link. We mark the newly
		 * allocated ABD with ABD_FLAG_GANG_FREE, before
		 * adding it to the gang ABD's list, to make the
		 * gang ABD aware that it is responsible to call
		 * abd_free(). We use abd_get_offset() in order
		 * to just allocate a new ABD but avoid copying the
		 * data over into the newly allocated ABD.
		 *
		 * An ABD may become part of multiple gang ABDs. For
		 * example, when writing ditto blocks, the same ABD
		 * is used to write 2 or 3 locations with 2 or 3
		 * zio_t's. Each of the zio's may be aggregated with
		 * different adjacent zio's. zio aggregation uses gang
		 * zio's, so the single ABD can become part of multiple
		 * gang zio's.
		 *
		 * The ASSERT below is to make sure that if
		 * free_on_free is passed as B_TRUE, the ABD cannot
		 * be in multiple gang ABDs. The gang ABD
		 * cannot be responsible for cleaning up the child
		 * ABD memory allocation if the ABD can be in
		 * multiple gang ABDs at one time.
		 */
		ASSERT3B(free_on_free, ==, B_FALSE);
		child_abd = abd_get_offset(cabd, 0);
		child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	} else {
		child_abd = cabd;
		if (free_on_free)
			child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	}
	ASSERT3P(child_abd, !=, NULL);

	list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
	mutex_exit(&cabd->abd_mtx);
	pabd->abd_size += child_abd->abd_size;
}
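
/*
 * Example (editor's sketch, not in the upstream file): stitching two
 * independently allocated ABDs into one logical buffer. With
 * free_on_free = B_TRUE the gang takes over freeing its children.
 *
 *	abd_t *hdr = abd_alloc(512, B_FALSE);
 *	abd_t *payload = abd_alloc(4096, B_FALSE);
 *	abd_t *gang = abd_alloc_gang();
 *
 *	abd_gang_add(gang, hdr, B_TRUE);
 *	abd_gang_add(gang, payload, B_TRUE);
 *	ASSERT3U(gang->abd_size, ==, 512 + 4096);
 *
 *	abd_free(gang);			(also frees hdr and payload)
 */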

/*
 * Locate the ABD for the supplied offset in the gang ABD.
 * Return a new offset relative to the returned ABD.
 */
abd_t *
abd_gang_get_offset(abd_t *abd, size_t *off)
{
	abd_t *cabd;

	ASSERT(abd_is_gang(abd));
	ASSERT3U(*off, <, abd->abd_size);
	for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		if (*off >= cabd->abd_size)
			*off -= cabd->abd_size;
		else
			return (cabd);
	}
	VERIFY3P(cabd, !=, NULL);
	return (cabd);
}

/*
 * Allocate a new ABD, using the provided struct (if non-NULL, and if
 * circumstances allow - otherwise allocate the struct). The returned ABD will
 * point to offset off of sabd. It shares the underlying buffer data with sabd.
 * Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
 */
static abd_t *
abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_verify(sabd);
	ASSERT3U(off + size, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		if (abd == NULL)
			abd = abd_alloc_struct(0);
		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;

		/*
		 * User pages from Direct I/O requests may be in a single page
		 * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
		 * that here for abd. This is required because we have to be
		 * careful when borrowing the buffer from the ABD because we
		 * can not place user pages under write protection on Linux.
		 * See the comments in abd_os.c for abd_borrow_buf(),
		 * abd_borrow_buf_copy(), abd_return_buf() and
		 * abd_return_buf_copy().
		 */
		if (abd_is_from_pages(sabd)) {
			abd->abd_flags |= ABD_FLAG_FROM_PAGES |
			    ABD_FLAG_LINEAR_PAGE;
		}

		ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
	} else if (abd_is_gang(sabd)) {
		size_t left = size;
		if (abd == NULL) {
			abd = abd_alloc_gang();
		} else {
			abd->abd_flags |= ABD_FLAG_GANG;
			list_create(&ABD_GANG(abd).abd_gang_chain,
			    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
		}

		abd->abd_flags &= ~ABD_FLAG_OWNER;
		for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
		    cabd != NULL && left > 0;
		    cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
			int csize = MIN(left, cabd->abd_size - off);

			abd_t *nabd = abd_get_offset_size(cabd, off, csize);
			abd_gang_add(abd, nabd, B_TRUE);
			left -= csize;
			off = 0;
		}
		ASSERT0(left);
	} else {
		abd = abd_get_offset_scatter(abd, sabd, off, size);
	}

	ASSERT3P(abd, !=, NULL);
	abd->abd_size = size;
#ifdef ZFS_DEBUG
	abd->abd_parent = sabd;
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
#endif
	return (abd);
}

/*
 * Like abd_get_offset_size(), but memory for the abd_t is provided by the
 * caller. Using this routine can improve performance by avoiding the cost
 * of allocating memory for the abd_t struct, and updating the abd stats.
 * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
 * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
 * be allocated. Therefore callers should be careful to use the returned
 * abd_t*.
 */
abd_t *
abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_t *result;
	abd_init_struct(abd);
	result = abd_get_offset_impl(abd, sabd, off, size);
	if (result != abd)
		abd_fini_struct(abd);
	return (result);
}

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
	VERIFY3U(size, >, 0);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}

/*
 * Return a scatter ABD of the given size, containing only zeros.
 */
abd_t *
abd_get_zeros(size_t size)
{
	ASSERT3P(abd_zero_scatter, !=, NULL);
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	return (abd_get_offset_size(abd_zero_scatter, 0, size));
}
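
/*
 * Example (editor's sketch, not in the upstream file): a zero-copy view of
 * the second half of an existing ABD. The view shares data with its parent,
 * so the parent must outlive it.
 *
 *	abd_t *abd = abd_alloc(8192, B_FALSE);
 *	abd_t *half = abd_get_offset_size(abd, 4096, 4096);
 *
 *	abd_zero_off(half, 0, half->abd_size);	(zeroes bytes 4096-8191
 *						of abd as well)
 *	abd_free(half);				(free the view first)
 *	abd_free(abd);
 */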

/*
 * Create a linear ABD for an existing buf.
 */
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags |= ABD_FLAG_LINEAR;
	abd->abd_size = size;

	ABD_LINEAR_BUF(abd) = buf;

	return (abd);
}

abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);
	return (abd_get_from_buf_impl(abd, buf, size));
}

abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
	abd_init_struct(abd);
	return (abd_get_from_buf_impl(abd, buf, size));
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (ABD_LINEAR_BUF(abd));
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);

	/*
	 * abd_free() needs to handle LINEAR_PAGE ABDs specially.
	 * Since that flag does not survive the
	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
	 * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
	 */
	ASSERT(!abd_is_linear_page(abd));

	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}

/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);
}
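
/*
 * Example (editor's sketch, not in the upstream file): wrapping a
 * zio_data_buf_alloc()'ed buffer and handing the ABD responsibility for
 * freeing it, so a later abd_free() releases the buffer too.
 *
 *	void *buf = zio_data_buf_alloc(4096);
 *	abd_t *abd = abd_get_from_buf(buf, 4096);
 *
 *	abd_take_ownership_of_buf(abd, B_FALSE);
 *	...
 *	abd_free(abd);		(frees buf via zio_data_buf_free())
 *
 * Without the ownership transfer, abd_free() would release only the abd_t
 * and the caller would still own buf.
 */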

/*
 * Initializes an abd_iter based on whether the abd is a gang ABD
 * or just a single ABD.
 */
static inline abd_t *
abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
{
	abd_t *cabd = NULL;

	if (abd_is_gang(abd)) {
		cabd = abd_gang_get_offset(abd, &off);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, off);
		}
	} else {
		abd_iter_init(aiter, abd);
		abd_iter_advance(aiter, off);
	}
	return (cabd);
}

/*
 * Advances an abd_iter. We have to be careful with gang ABDs as
 * advancing could mean that we are at the end of a particular ABD and
 * must grab the next ABD in the gang ABD's list.
 */
static inline abd_t *
abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
    size_t len)
{
	abd_iter_advance(aiter, len);
	if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
		ASSERT3P(cabd, !=, NULL);
		cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, 0);
		}
	}
	return (cabd);
}

int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}
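
/*
 * Example (editor's sketch, not in the upstream file): a custom
 * abd_iter_func_t that counts non-zero bytes across an entire ABD,
 * regardless of how the data is stored. Each mapped segment arrives
 * as a plain buffer, just like in the callbacks defined below.
 *
 *	static int
 *	count_nonzero_cb(void *buf, size_t size, void *private)
 *	{
 *		uint64_t *count = private;
 *		char *p = buf;
 *
 *		while (size-- > 0)
 *			*count += (*p++ != 0);
 *		return (0);		(nonzero would abort the walk)
 *	}
 *
 *	uint64_t count = 0;
 *	(void) abd_iterate_func(abd, 0, abd->abd_size,
 *	    count_nonzero_cb, &count);
 */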

#if defined(__linux__) && defined(_KERNEL)
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_page(&aiter);

		size_t len = MIN(aiter.iter_page_dsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_page, aiter.iter_page_doff,
		    len, private);

		aiter.iter_page = NULL;
		aiter.iter_page_doff = 0;
		aiter.iter_page_dsize = 0;

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}
#endif

struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) private;
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called many
 * times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;
	abd_t *c_dabd, *c_sabd;

	if (size == 0)
		return (0);

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
	c_sabd = abd_init_abd_iter(sabd, &saiter, soff);

	while (size > 0) {
		IMPLY(abd_is_gang(dabd), c_dabd != NULL);
		IMPLY(abd_is_gang(sabd), c_sabd != NULL);

		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		c_dabd =
		    abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
		c_sabd =
		    abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
	}

	return (ret);
}

static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	(void) private;
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the contents of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd)
{
	ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
	return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
	    abd_cmp_cb, NULL));
}

/*
 * Check if ABD content is all-zeroes.
 */
static int
abd_cmp_zero_off_cb(void *data, size_t len, void *private)
{
	(void) private;

	/* This function can only check whole uint64s. Enforce that. */
	ASSERT0(P2PHASE(len, 8));

	uint64_t *end = (uint64_t *)((char *)data + len);
	for (uint64_t *word = (uint64_t *)data; word < end; word++)
		if (*word != 0)
			return (1);

	return (0);
}

int
abd_cmp_zero_off(abd_t *abd, size_t off, size_t size)
{
	return (abd_iterate_func(abd, off, size, abd_cmp_zero_off_cb, NULL));
}
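
/*
 * Example (editor's sketch, not in the upstream file): using the zero-check
 * to skip writing an all-zero block. The size must be a multiple of 8 to
 * satisfy the callback's whole-uint64 requirement above.
 *
 *	if (abd_cmp_zero_off(abd, 0, abd->abd_size) == 0) {
 *		(entire ABD is zeroes; the caller might punch a hole
 *		instead of writing it out)
 *	}
 */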

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds           parity ABDs, must have equal size
 * @dabd            data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen  should be implemented so that its behaviour
 *                  is the same when taking linear and scatter inputs
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
    size_t csize, size_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	size_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter;
	void *caddrs[3], *daddr;
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_dabd = NULL;

	ASSERT3U(parity, <=, 3);
	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		ASSERT3U(off + csize, <=, cabds[i]->abd_size);
		c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
	}

	if (dsize > 0) {
		ASSERT(dabd);
		abd_verify(dabd);
		ASSERT3U(off + dsize, <=, dabd->abd_size);
		c_dabd = abd_init_abd_iter(dabd, &daiter, off);
	}

	abd_enter_critical(flags);
	while (csize > 0) {
		len = csize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
			len = MIN(caiters[i].iter_mapsize, len);
		}

		if (dsize > 0) {
			IMPLY(abd_is_gang(dabd), c_dabd != NULL);
			abd_iter_map(&daiter);
			daddr = daiter.iter_mapaddr;
			len = MIN(daiter.iter_mapsize, len);
			dlen = len;
		} else {
			daddr = NULL;
			dlen = 0;
		}

		/* must be progressive */
		ASSERT3U(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &caiters[i], len);
		}

		if (dsize > 0) {
			abd_iter_unmap(&daiter);
			c_dabd =
			    abd_advance_abd_iter(dabd, c_dabd, &daiter,
			    dlen);
			dsize -= dlen;
		}

		csize -= len;
	}
	abd_exit_critical(flags);
}

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds           parity ABDs, must have equal size
 * @tabds           rec target ABDs, at most 3
 * @tsize           size of data target columns
 * @func_raidz_rec  expects syndrome data in target columns. Function
 *                  reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    size_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	size_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_tabds[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		abd_verify(tabds[i]);
		ASSERT3U(tsize, <=, cabds[i]->abd_size);
		ASSERT3U(tsize, <=, tabds[i]->abd_size);
		c_cabds[i] =
		    abd_init_abd_iter(cabds[i], &citers[i], 0);
		c_tabds[i] =
		    abd_init_abd_iter(tabds[i], &xiters[i], 0);
	}

	abd_enter_critical(flags);
	while (tsize > 0) {
		len = tsize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
			len = MIN(citers[i].iter_mapsize, len);
			len = MIN(xiters[i].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			c_tabds[i] =
			    abd_advance_abd_iter(tabds[i], c_tabds[i],
			    &xiters[i], len);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
	abd_exit_critical(flags);
}

EXPORT_SYMBOL(abd_free);