GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * This tunable disables predictive prefetch. Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 */

static int zfs_prefetch_disable = B_FALSE;

/* max # of streams per zfetch */
static unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */
static unsigned int zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int zfetch_max_sec_reap = 2;
#ifdef _ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int zfetch_max_distance = 64 * 1024 * 1024;
#endif
/* max bytes to prefetch indirects for per stream (default 128MB) */
unsigned int zfetch_max_idistance = 128 * 1024 * 1024;
/* max request reorder distance within a stream (default 16MB) */
unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
/* Max log2 fraction of holes in a stream */
unsigned int zfetch_hole_shift = 2;
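/*
 * Note: zfetch_hole_shift bounds the tolerated hole fraction to
 * 1 / 2^zfetch_hole_shift of the covered stream length, so the default of 2
 * allows streams with up to a quarter of their blocks unread (see
 * dmu_zfetch_future() below).
 */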

typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_future;
	kstat_named_t zfetchstat_stride;
	kstat_named_t zfetchstat_past;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	kstat_named_t zfetchstat_io_issued;
	kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "future", KSTAT_DATA_UINT64 },
	{ "stride", KSTAT_DATA_UINT64 },
	{ "past", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "max_streams", KSTAT_DATA_UINT64 },
	{ "io_issued", KSTAT_DATA_UINT64 },
	{ "io_active", KSTAT_DATA_UINT64 },
};

struct {
	wmsum_t zfetchstat_hits;
	wmsum_t zfetchstat_future;
	wmsum_t zfetchstat_stride;
	wmsum_t zfetchstat_past;
	wmsum_t zfetchstat_misses;
	wmsum_t zfetchstat_max_streams;
	wmsum_t zfetchstat_io_issued;
	aggsum_t zfetchstat_io_active;
} zfetch_sums;

#define	ZFETCHSTAT_BUMP(stat) \
	wmsum_add(&zfetch_sums.stat, 1)
#define	ZFETCHSTAT_ADD(stat, val) \
	wmsum_add(&zfetch_sums.stat, val)


static kstat_t *zfetch_ksp;

static int
zfetch_kstats_update(kstat_t *ksp, int rw)
{
	zfetch_stats_t *zs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	zs->zfetchstat_hits.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_hits);
	zs->zfetchstat_future.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_future);
	zs->zfetchstat_stride.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_stride);
	zs->zfetchstat_past.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_past);
	zs->zfetchstat_misses.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_misses);
	zs->zfetchstat_max_streams.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
	zs->zfetchstat_io_issued.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
	zs->zfetchstat_io_active.value.ui64 =
	    aggsum_value(&zfetch_sums.zfetchstat_io_active);
	return (0);
}

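/*
 * Set up the counters behind the zfetch statistics and register the
 * "zfetchstats" kstat that exports them.
 */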
void
zfetch_init(void)
{
	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
	aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		zfetch_ksp->ks_update = zfetch_kstats_update;
		kstat_install(zfetch_ksp);
	}
}

void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}

	wmsum_fini(&zfetch_sums.zfetchstat_hits);
	wmsum_fini(&zfetch_sums.zfetchstat_future);
	wmsum_fini(&zfetch_sums.zfetchstat_stride);
	wmsum_fini(&zfetch_sums.zfetchstat_past);
	wmsum_fini(&zfetch_sums.zfetchstat_misses);
	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
	ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
	aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}

/*
 * This takes a pointer to a zfetch structure and a dnode. It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL)
		return;
	zf->zf_dnode = dno;
	zf->zf_numstreams = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zs_node));

	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
	ASSERT(!list_link_active(&zs->zs_node));
	zfs_refcount_destroy(&zs->zs_callers);
	zfs_refcount_destroy(&zs->zs_refs);
	kmem_free(zs, sizeof (*zs));
}

static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	list_remove(&zf->zf_stream, zs);
	zf->zf_numstreams--;
	membar_producer();
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}

/*
 * Clean up state associated with a zfetch structure (e.g. destroy the
 * streams). This doesn't free the zfetch_t itself; that's left to the caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
	zstream_t *zs;

	mutex_enter(&zf->zf_lock);
	while ((zs = list_head(&zf->zf_stream)) != NULL)
		dmu_zfetch_stream_remove(zf, zs);
	mutex_exit(&zf->zf_lock);
	list_destroy(&zf->zf_stream);
	mutex_destroy(&zf->zf_lock);

	zf->zf_dnode = NULL;
}

/*
 * If there aren't too many active streams already, create one more.
 * In the process, delete or reuse any streams that have gone without hits
 * for zfetch_max_sec_reap. If needed, reuse the oldest stream that has gone
 * without hits for at least zfetch_min_sec_reap (or has never had one).
 * The "blkid" argument is the next block that we expect this stream to access.
 */
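/*
 * Note: stream ages are compared as (int)(zs->zs_atime - t) below so that
 * the comparison remains correct if the 32-bit second counter wraps.
 */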
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs, *zs_next, *zs_old = NULL;
	uint_t now = gethrestime_sec(), t;

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Delete streams that are too old, reusing the first one found.
	 */
	t = now - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active. A refcount of exactly 1 means
		 * only the zf_stream list itself holds a reference.
		 */
		if ((int)(zs->zs_atime - t) >= 0)
			continue;
		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (zs_old)
			dmu_zfetch_stream_remove(zf, zs);
		else
			zs_old = zs;
	}
	if (zs_old) {
		zs = zs_old;
		list_remove(&zf->zf_stream, zs);
		goto reuse;
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
		t = now - zfetch_min_sec_reap;
		for (zs = list_head(&zf->zf_stream); zs != NULL;
		    zs = list_next(&zf->zf_stream, zs)) {
			if ((int)(zs->zs_atime - t) >= 0)
				continue;
			if (zfs_refcount_count(&zs->zs_refs) != 1)
				continue;
			if (zs_old == NULL ||
			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
				zs_old = zs;
		}
		if (zs_old) {
			zs = zs_old;
			list_remove(&zf->zf_stream, zs);
			goto reuse;
		}
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;

reuse:
	list_insert_head(&zf->zf_stream, zs);
	zs->zs_blkid = blkid;
	/* Allow immediate stream reuse until first hit. */
	zs->zs_atime = now - zfetch_min_sec_reap;
	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
	zs->zs_pf_dist = 0;
	zs->zs_ipf_dist = 0;
	zs->zs_pf_start = blkid;
	zs->zs_pf_end = blkid;
	zs->zs_ipf_start = blkid;
	zs->zs_ipf_end = blkid;
	zs->zs_missed = B_FALSE;
	zs->zs_more = B_FALSE;
}

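/*
 * Completion callback for prefetch I/Os issued on behalf of a stream. If a
 * data (level-0) read was actually issued for a block the stream position
 * has already passed, prefetch is lagging behind demand, so note that the
 * prefetch distance may grow. Drop the stream reference taken for this I/O
 * and decrement the count of in-flight prefetch I/Os.
 */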
static void
dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
	zstream_t *zs = arg;

	if (io_issued && level == 0 && blkid < zs->zs_blkid)
		zs->zs_more = B_TRUE;
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}

/*
 * Process a stream hit of nblks blocks starting at zs_blkid. Return the
 * number of blocks to proceed with after aggregation with any stored
 * future ranges.
 */
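/*
 * For example, if the stream holds future ranges [2,4) and [5,8) (block
 * offsets relative to zs_blkid) and a hit of nblks = 6 arrives, the first
 * range is consumed, the second overlaps the access end, and the stream
 * advances by 8 blocks rather than 6.
 */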
static uint64_t
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
{
	uint_t i, j;

	/* Optimize sequential accesses (no future ranges). */
	if (zs->zs_ranges[0].start == 0)
		goto done;

	/* Look for intersections with further ranges. */
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > nblks)
			break;
		if (r->end >= nblks) {
			nblks = r->end;
			i++;
			break;
		}
	}

	/* Delete all intersecting ranges found, updating the rest. */
	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
		if (zs->zs_ranges[i].start == 0)
			break;
		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
	}
	if (j < ZFETCH_RANGES) {
		zs->zs_ranges[j].start = 0;
		zs->zs_ranges[j].end = 0;
	}

done:
	zs->zs_blkid += nblks;
	return (nblks);
}

/*
 * Process a future stream access of nblks blocks starting at blkid. Return
 * the number of blocks to proceed with if the recorded future ranges reach
 * the fill threshold, or 0 otherwise.
 */
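/*
 * The access is first merged into the stream's sorted list of future ranges.
 * Once those ranges cover enough of the gap ahead (holes no larger than
 * 1/2^zfetch_hole_shift of the covered length), the stream position jumps
 * forward via dmu_zfetch_hit().
 */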
static uint64_t
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
{
	ASSERT3U(blkid, >, zs->zs_blkid);
	blkid -= zs->zs_blkid;
	ASSERT3U(blkid + nblks, <=, UINT16_MAX);

	/* Search for first and last intersection or insert point. */
	uint_t f = ZFETCH_RANGES, l = 0, i;
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > blkid + nblks)
			break;
		if (r->end < blkid)
			continue;
		if (f > i)
			f = i;
		if (l < i)
			l = i;
	}
	if (f <= l) {
		/* Got some intersecting range, expand it if needed. */
		if (zs->zs_ranges[f].start > blkid)
			zs->zs_ranges[f].start = blkid;
		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
		if (f < l) {
			/* Got more than one intersection, remove others. */
			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
			}
			zs->zs_ranges[f].start = 0;
			zs->zs_ranges[f].end = 0;
		}
	} else if (i < ZFETCH_RANGES) {
		/* Got no intersecting ranges, insert a new one. */
		for (l = ZFETCH_RANGES - 1; l > i; l--) {
			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
		}
		zs->zs_ranges[i].start = blkid;
		zs->zs_ranges[i].end = blkid + nblks;
	} else {
		/* No space left to insert. Drop the range. */
		return (0);
	}

	/* Check whether this new access brings us to the fill threshold. */
	if (zfetch_hole_shift >= 16)
		return (0);
	uint_t hole = 0;
	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0)
			break;
		hole += r->start - f;
		f = r->end;
		if (hole <= r->end >> zfetch_hole_shift)
			l = r->end;
	}
	if (l > 0)
		return (dmu_zfetch_hit(zs, l));

	return (0);
}

/*
 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
 * associates the dnode access specified by the blkid and nblks arguments
 * with a prefetch stream, predicts further accesses based on that stream's
 * statistics, and returns the stream pointer on success. That pointer must
 * later be passed to dmu_zfetch_run() to initiate the speculative prefetch
 * for the stream and release it. dmu_zfetch() is a wrapper for simple cases
 * when a window between prediction and prefetch initiation is not needed.
 * The fetch_data argument specifies whether actual data blocks should be
 * fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
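/*
 * A sketch of typical use (this is how dmu_buf_hold_array_by_dnode() in
 * dmu.c drives it):
 *
 *	zstream_t *zs = dmu_zfetch_prepare(zf, blkid, nblks, read, B_TRUE);
 *	// ... perform the demand access, noting whether any dbuf read
 *	// was actually required ("missed") ...
 *	if (zs != NULL)
 *		dmu_zfetch_run(zf, zs, missed, B_TRUE, uncached);
 */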
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
	zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
	int64_t ipf_start, ipf_end;

	if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
		return (NULL);

	if (os_prefetch == ZFS_PREFETCH_METADATA)
		fetch_data = B_FALSE;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs). So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find a perfect prefetch stream. Depending on whether the accesses
	 * are block-aligned, the first block of the new access may either
	 * follow the last block of the previous access, or be equal to it.
	 */
	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
	uint64_t end_blkid = blkid + nblks;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			goto hit;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			goto hit;
		}
	}

	/*
	 * Find a close enough prefetch stream. An access crossing the stream
	 * position is a hit in its new part. An access ahead of the stream
	 * position is considered a hit for metadata prefetch, since we do
	 * not care about fill percent there; otherwise it is stored for the
	 * future. An access behind the stream position is silently ignored,
	 * since we already skipped it when reaching fill percent.
	 */
	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid > zs->zs_blkid) {
			if (end_blkid <= zs->zs_blkid + max_reorder) {
				if (!fetch_data) {
					nblks = dmu_zfetch_hit(zs,
					    end_blkid - zs->zs_blkid);
					ZFETCHSTAT_BUMP(zfetchstat_stride);
					goto future;
				}
				nblks = dmu_zfetch_future(zs, blkid, nblks);
				if (nblks > 0)
					ZFETCHSTAT_BUMP(zfetchstat_stride);
				else
					ZFETCHSTAT_BUMP(zfetchstat_future);
				goto future;
			}
		} else if (end_blkid >= zs->zs_blkid) {
			nblks -= zs->zs_blkid - blkid;
			blkid += zs->zs_blkid - blkid;
			goto hit;
		} else if (end_blkid + max_reorder > zs->zs_blkid &&
		    (int)(zs->zs_atime - t) >= 0) {
			ZFETCHSTAT_BUMP(zfetchstat_past);
			zs->zs_atime = gethrestime_sec();
			goto out;
		}
	}

	/*
	 * This access is not part of any existing stream. Create a new
	 * stream for it unless we are at the end of file.
	 */
	ASSERT0P(zs);
	if (end_blkid < maxblkid)
		dmu_zfetch_stream_create(zf, end_blkid);
	mutex_exit(&zf->zf_lock);
	ZFETCHSTAT_BUMP(zfetchstat_misses);
	ipf_start = 0;
	goto prescient;

hit:
	nblks = dmu_zfetch_hit(zs, nblks);
	ZFETCHSTAT_BUMP(zfetchstat_hits);

future:
	zs->zs_atime = gethrestime_sec();

	/* Exit if we already prefetched for this position before. */
	if (nblks == 0)
		goto out;

	/* If the file is ending, remove the stream. */
	end_blkid = zs->zs_blkid;
	if (end_blkid >= maxblkid) {
		dmu_zfetch_stream_remove(zf, zs);
out:
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream. Calculate further prefetch distances.
	 *
	 * Start prefetch from the demand access size (nblks). Double the
	 * distance every access up to zfetch_min_distance. After that,
	 * increase the distance by 1/8 only if needed, up to
	 * zfetch_max_distance.
	 *
	 * Don't double the distance beyond a single block if we have more
	 * than ~6% of ARC held by active prefetches. It should help with
	 * getting out of RAM on some badly mispredicted read patterns.
	 */
	unsigned int nbytes = nblks << dbs;
	unsigned int pf_nblks;
	if (fetch_data) {
		if (unlikely(zs->zs_pf_dist < nbytes))
			zs->zs_pf_dist = nbytes;
		else if (zs->zs_pf_dist < zfetch_min_distance &&
		    (zs->zs_pf_dist < (1 << dbs) ||
		    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
		    arc_c_max >> (4 + dbs)) < 0))
			zs->zs_pf_dist *= 2;
		else if (zs->zs_more)
			zs->zs_pf_dist += zs->zs_pf_dist / 8;
		zs->zs_more = B_FALSE;
		if (zs->zs_pf_dist > zfetch_max_distance)
			zs->zs_pf_dist = zfetch_max_distance;
		pf_nblks = zs->zs_pf_dist >> dbs;
	} else {
		pf_nblks = 0;
	}
	if (zs->zs_pf_start < end_blkid)
		zs->zs_pf_start = end_blkid;
	if (zs->zs_pf_end < end_blkid + pf_nblks)
		zs->zs_pf_end = end_blkid + pf_nblks;

	/*
	 * Do the same for indirects, starting where we will stop reading
	 * data blocks (and the indirects that point to them).
	 */
	if (unlikely(zs->zs_ipf_dist < nbytes))
		zs->zs_ipf_dist = nbytes;
	else
		zs->zs_ipf_dist *= 2;
	if (zs->zs_ipf_dist > zfetch_max_idistance)
		zs->zs_ipf_dist = zfetch_max_idistance;
	pf_nblks = zs->zs_ipf_dist >> dbs;
	if (zs->zs_ipf_start < zs->zs_pf_end)
		zs->zs_ipf_start = zs->zs_pf_end;
	ipf_start = zs->zs_ipf_end;
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

prescient:
	/*
	 * Prefetch the following indirect blocks for this access to reduce
	 * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode().
	 * This covers the gap during the first couple of accesses when we
	 * cannot predict the future yet, but know what is needed right now.
	 * It should be very rare for reads/writes to need more than one
	 * indirect, but this is more useful for cloning due to its much
	 * bigger accesses.
	 */
	ipf_start = MAX(ipf_start, blkid + 1);
	int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs;

	int issued = 0;
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
	return (zs);
}

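/*
 * Issue the speculative prefetch predicted by dmu_zfetch_prepare() and drop
 * the stream reference it took. "missed" indicates whether the triggering
 * demand access actually required a read; only then is prefetching data
 * blocks (and not just indirects) worthwhile for this stream.
 */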
void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
    boolean_t have_lock, boolean_t uncached)
{
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block. The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_start;
		pf_end = zs->zs_pf_start = zs->zs_pf_end;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = zs->zs_ipf_start;
	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/*
		 * Take more references on top of the one taken in
		 * dmu_zfetch_prepare().
		 */
		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}
	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, uncached ?
		    ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

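/*
 * A convenience wrapper around dmu_zfetch_prepare() and dmu_zfetch_run() for
 * callers that don't need a window between prediction and prefetch initiation.
 */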
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock, boolean_t uncached)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
	"Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
	"Max log2 fraction of holes in a stream");