GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/compression.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2008 Oracle. All rights reserved.
4
*/
5
6
#include <linux/kernel.h>
7
#include <linux/bio.h>
8
#include <linux/file.h>
9
#include <linux/fs.h>
10
#include <linux/pagemap.h>
11
#include <linux/pagevec.h>
12
#include <linux/highmem.h>
13
#include <linux/kthread.h>
14
#include <linux/time.h>
15
#include <linux/init.h>
16
#include <linux/string.h>
17
#include <linux/backing-dev.h>
18
#include <linux/writeback.h>
19
#include <linux/psi.h>
20
#include <linux/slab.h>
21
#include <linux/sched/mm.h>
22
#include <linux/log2.h>
23
#include <linux/shrinker.h>
24
#include <crypto/hash.h>
25
#include "misc.h"
26
#include "ctree.h"
27
#include "fs.h"
28
#include "btrfs_inode.h"
29
#include "bio.h"
30
#include "ordered-data.h"
31
#include "compression.h"
32
#include "extent_io.h"
33
#include "extent_map.h"
34
#include "subpage.h"
35
#include "messages.h"
36
#include "super.h"
37
38
static struct bio_set btrfs_compressed_bioset;
39
40
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
41
42
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
43
{
44
switch (type) {
45
case BTRFS_COMPRESS_ZLIB:
46
case BTRFS_COMPRESS_LZO:
47
case BTRFS_COMPRESS_ZSTD:
48
case BTRFS_COMPRESS_NONE:
49
return btrfs_compress_types[type];
50
default:
51
break;
52
}
53
54
return NULL;
55
}
56
57
static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
58
{
59
return container_of(bbio, struct compressed_bio, bbio);
60
}
61
62
static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
63
u64 start, blk_opf_t op,
64
btrfs_bio_end_io_t end_io)
65
{
66
struct btrfs_bio *bbio;
67
68
bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
69
GFP_NOFS, &btrfs_compressed_bioset));
70
btrfs_bio_init(bbio, inode, start, end_io, NULL);
71
return to_compressed_bio(bbio);
72
}
73
74
bool btrfs_compress_is_valid_type(const char *str, size_t len)
75
{
76
int i;
77
78
for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
79
size_t comp_len = strlen(btrfs_compress_types[i]);
80
81
if (len < comp_len)
82
continue;
83
84
if (!strncmp(btrfs_compress_types[i], str, comp_len))
85
return true;
86
}
87
return false;
88
}
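/*
* Note: the comparison above uses strncmp() over the length of the known
* name only, so a string that carries a level suffix, e.g. "zstd:3", is
* also accepted as a valid type here.
*/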
89
90
static int compression_compress_pages(int type, struct list_head *ws,
91
struct btrfs_inode *inode, u64 start,
92
struct folio **folios, unsigned long *out_folios,
93
unsigned long *total_in, unsigned long *total_out)
94
{
95
switch (type) {
96
case BTRFS_COMPRESS_ZLIB:
97
return zlib_compress_folios(ws, inode, start, folios,
98
out_folios, total_in, total_out);
99
case BTRFS_COMPRESS_LZO:
100
return lzo_compress_folios(ws, inode, start, folios,
101
out_folios, total_in, total_out);
102
case BTRFS_COMPRESS_ZSTD:
103
return zstd_compress_folios(ws, inode, start, folios,
104
out_folios, total_in, total_out);
105
case BTRFS_COMPRESS_NONE:
106
default:
107
/*
108
* This can happen when compression races with remount setting
109
* it to 'no compress', while the caller doesn't call
110
* inode_need_compress() to check if we really need to
111
* compress.
112
*
113
* Not a big deal, just need to inform caller that we
114
* haven't allocated any pages yet.
115
*/
116
*out_folios = 0;
117
return -E2BIG;
118
}
119
}
120
121
static int compression_decompress_bio(struct list_head *ws,
122
struct compressed_bio *cb)
123
{
124
switch (cb->compress_type) {
125
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
126
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
127
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
128
case BTRFS_COMPRESS_NONE:
129
default:
130
/*
131
* This can't happen, the type is validated several times
132
* before we get here.
133
*/
134
BUG();
135
}
136
}
137
138
static int compression_decompress(int type, struct list_head *ws,
139
const u8 *data_in, struct folio *dest_folio,
140
unsigned long dest_pgoff, size_t srclen, size_t destlen)
141
{
142
switch (type) {
143
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
144
dest_pgoff, srclen, destlen);
145
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio,
146
dest_pgoff, srclen, destlen);
147
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
148
dest_pgoff, srclen, destlen);
149
case BTRFS_COMPRESS_NONE:
150
default:
151
/*
152
* This can't happen, the type is validated several times
153
* before we get here.
154
*/
155
BUG();
156
}
157
}
158
159
static void btrfs_free_compressed_folios(struct compressed_bio *cb)
160
{
161
for (unsigned int i = 0; i < cb->nr_folios; i++)
162
btrfs_free_compr_folio(cb->compressed_folios[i]);
163
kfree(cb->compressed_folios);
164
}
165
166
static int btrfs_decompress_bio(struct compressed_bio *cb);
167
168
/*
169
* Global cache of last unused pages for compression/decompression.
170
*/
171
static struct btrfs_compr_pool {
172
struct shrinker *shrinker;
173
spinlock_t lock;
174
struct list_head list;
175
int count;
176
int thresh;
177
} compr_pool;
178
179
static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
180
{
181
int ret;
182
183
/*
184
* We must not read the values more than once if 'ret' gets expanded in
185
* the return statement so we don't accidentally return a negative
186
* number, even if the first condition finds it positive.
187
*/
188
ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
189
190
return ret > 0 ? ret : 0;
191
}
192
193
static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
194
{
195
LIST_HEAD(remove);
196
struct list_head *tmp, *next;
197
int freed;
198
199
if (compr_pool.count == 0)
200
return SHRINK_STOP;
201
202
/* For now, just simply drain the whole list. */
203
spin_lock(&compr_pool.lock);
204
list_splice_init(&compr_pool.list, &remove);
205
freed = compr_pool.count;
206
compr_pool.count = 0;
207
spin_unlock(&compr_pool.lock);
208
209
list_for_each_safe(tmp, next, &remove) {
210
struct page *page = list_entry(tmp, struct page, lru);
211
212
ASSERT(page_ref_count(page) == 1);
213
put_page(page);
214
}
215
216
return freed;
217
}
218
219
/*
220
* Common wrappers for folio allocation and freeing used by the compression code
221
*/
222
struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
223
{
224
struct folio *folio = NULL;
225
226
/* For bs > ps cases, no cached folio pool for now. */
227
if (fs_info->block_min_order)
228
goto alloc;
229
230
spin_lock(&compr_pool.lock);
231
if (compr_pool.count > 0) {
232
folio = list_first_entry(&compr_pool.list, struct folio, lru);
233
list_del_init(&folio->lru);
234
compr_pool.count--;
235
}
236
spin_unlock(&compr_pool.lock);
237
238
if (folio)
239
return folio;
240
241
alloc:
242
return folio_alloc(GFP_NOFS, fs_info->block_min_order);
243
}
244
245
void btrfs_free_compr_folio(struct folio *folio)
246
{
247
bool do_free = false;
248
249
/* The folio is from bs > ps fs, no cached pool for now. */
250
if (folio_order(folio))
251
goto free;
252
253
spin_lock(&compr_pool.lock);
254
if (compr_pool.count > compr_pool.thresh) {
255
do_free = true;
256
} else {
257
list_add(&folio->lru, &compr_pool.list);
258
compr_pool.count++;
259
}
260
spin_unlock(&compr_pool.lock);
261
262
if (!do_free)
263
return;
264
265
free:
266
ASSERT(folio_ref_count(folio) == 1);
267
folio_put(folio);
268
}
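/*
* Pool behaviour in short: order-0 folios are recycled through compr_pool
* until compr_pool.count exceeds compr_pool.thresh; anything beyond the
* threshold, and any higher-order folio (block size > page size), is freed
* immediately.
*/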
269
270
static void end_bbio_compressed_read(struct btrfs_bio *bbio)
271
{
272
struct compressed_bio *cb = to_compressed_bio(bbio);
273
blk_status_t status = bbio->bio.bi_status;
274
275
if (!status)
276
status = errno_to_blk_status(btrfs_decompress_bio(cb));
277
278
btrfs_free_compressed_folios(cb);
279
btrfs_bio_end_io(cb->orig_bbio, status);
280
bio_put(&bbio->bio);
281
}
282
283
/*
284
* Clear the writeback bits on all of the file
285
* pages for a compressed write
286
*/
287
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
288
{
289
struct inode *inode = &cb->bbio.inode->vfs_inode;
290
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
291
pgoff_t index = cb->start >> PAGE_SHIFT;
292
const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
293
struct folio_batch fbatch;
294
int i;
295
int ret;
296
297
ret = blk_status_to_errno(cb->bbio.bio.bi_status);
298
if (ret)
299
mapping_set_error(inode->i_mapping, ret);
300
301
folio_batch_init(&fbatch);
302
while (index <= end_index) {
303
ret = filemap_get_folios(inode->i_mapping, &index, end_index,
304
&fbatch);
305
306
if (ret == 0)
307
return;
308
309
for (i = 0; i < ret; i++) {
310
struct folio *folio = fbatch.folios[i];
311
312
btrfs_folio_clamp_clear_writeback(fs_info, folio,
313
cb->start, cb->len);
314
}
315
folio_batch_release(&fbatch);
316
}
317
/* the inode may be gone now */
318
}
319
320
/*
321
* Do the cleanup once all the compressed pages hit the disk. This will clear
322
* writeback on the file pages and free the compressed pages.
323
*
324
* This also calls the writeback end hooks for the file pages so that metadata
325
* and checksums can be updated in the file.
326
*/
327
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
328
{
329
struct compressed_bio *cb = to_compressed_bio(bbio);
330
331
btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
332
cb->bbio.bio.bi_status == BLK_STS_OK);
333
334
if (cb->writeback)
335
end_compressed_writeback(cb);
336
/* Note, our inode could be gone now. */
337
btrfs_free_compressed_folios(cb);
338
bio_put(&cb->bbio.bio);
339
}
340
341
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
342
{
343
struct bio *bio = &cb->bbio.bio;
344
u32 offset = 0;
345
unsigned int findex = 0;
346
347
while (offset < cb->compressed_len) {
348
struct folio *folio = cb->compressed_folios[findex];
349
u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio));
350
int ret;
351
352
/* Maximum compressed extent is smaller than bio size limit. */
353
ret = bio_add_folio(bio, folio, len, 0);
354
ASSERT(ret);
355
offset += len;
356
findex++;
357
}
358
}
359
360
/*
361
* worker function to build and submit bios for previously compressed pages.
362
* The corresponding pages in the inode should be marked for writeback
363
* and the compressed pages should have a reference on them for dropping
364
* when the IO is complete.
365
*
366
* This also checksums the file bytes and gets things ready for
367
* the end io hooks.
368
*/
369
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
370
struct folio **compressed_folios,
371
unsigned int nr_folios,
372
blk_opf_t write_flags,
373
bool writeback)
374
{
375
struct btrfs_inode *inode = ordered->inode;
376
struct btrfs_fs_info *fs_info = inode->root->fs_info;
377
struct compressed_bio *cb;
378
379
ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
380
ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));
381
382
cb = alloc_compressed_bio(inode, ordered->file_offset,
383
REQ_OP_WRITE | write_flags,
384
end_bbio_compressed_write);
385
cb->start = ordered->file_offset;
386
cb->len = ordered->num_bytes;
387
cb->compressed_folios = compressed_folios;
388
cb->compressed_len = ordered->disk_num_bytes;
389
cb->writeback = writeback;
390
cb->nr_folios = nr_folios;
391
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
392
cb->bbio.ordered = ordered;
393
btrfs_add_compressed_bio_folios(cb);
394
395
btrfs_submit_bbio(&cb->bbio, 0);
396
}
397
398
/*
399
* Add extra pages in the same compressed file extent so that we don't need to
400
* re-read the same extent again and again.
401
*
402
* NOTE: this won't work well for subpage, as for subpage read, we lock the
403
* full page then submit a bio for each compressed/regular extent.
404
*
405
* This means that if several sectors in the same page point to the same
406
* on-disk compressed data, we will re-read the same extent many times and
407
* this function can only help for the next page.
408
*/
409
static noinline int add_ra_bio_pages(struct inode *inode,
410
u64 compressed_end,
411
struct compressed_bio *cb,
412
int *memstall, unsigned long *pflags)
413
{
414
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
415
pgoff_t end_index;
416
struct bio *orig_bio = &cb->orig_bbio->bio;
417
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
418
u64 isize = i_size_read(inode);
419
int ret;
420
struct folio *folio;
421
struct extent_map *em;
422
struct address_space *mapping = inode->i_mapping;
423
struct extent_map_tree *em_tree;
424
struct extent_io_tree *tree;
425
int sectors_missed = 0;
426
427
em_tree = &BTRFS_I(inode)->extent_tree;
428
tree = &BTRFS_I(inode)->io_tree;
429
430
if (isize == 0)
431
return 0;
432
433
/*
434
* For current subpage support, we only support 64K page size,
435
* which means maximum compressed extent size (128K) is just 2x page
436
* size.
437
* This makes readahead less effective, so here disable readahead for
438
* subpage for now, until full compressed write is supported.
439
*/
440
if (fs_info->sectorsize < PAGE_SIZE)
441
return 0;
442
443
/* For bs > ps cases, we don't support readahead for compressed folios for now. */
444
if (fs_info->block_min_order)
445
return 0;
446
447
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
448
449
while (cur < compressed_end) {
450
pgoff_t page_end;
451
pgoff_t pg_index = cur >> PAGE_SHIFT;
452
u32 add_size;
453
454
if (pg_index > end_index)
455
break;
456
457
folio = filemap_get_folio(mapping, pg_index);
458
if (!IS_ERR(folio)) {
459
u64 folio_sz = folio_size(folio);
460
u64 offset = offset_in_folio(folio, cur);
461
462
folio_put(folio);
463
sectors_missed += (folio_sz - offset) >>
464
fs_info->sectorsize_bits;
465
466
/* Beyond threshold, no need to continue */
467
if (sectors_missed > 4)
468
break;
469
470
/*
471
* Jump to next page start as we already have page for
472
* current offset.
473
*/
474
cur += (folio_sz - offset);
475
continue;
476
}
477
478
folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS),
479
0, NULL);
480
if (!folio)
481
break;
482
483
if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
484
/* There is already a page, skip to page end */
485
cur += folio_size(folio);
486
folio_put(folio);
487
continue;
488
}
489
490
if (!*memstall && folio_test_workingset(folio)) {
491
psi_memstall_enter(pflags);
492
*memstall = 1;
493
}
494
495
ret = set_folio_extent_mapped(folio);
496
if (ret < 0) {
497
folio_unlock(folio);
498
folio_put(folio);
499
break;
500
}
501
502
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
503
btrfs_lock_extent(tree, cur, page_end, NULL);
504
read_lock(&em_tree->lock);
505
em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
506
read_unlock(&em_tree->lock);
507
508
/*
509
* At this point, we have a locked page in the page cache for
510
* these bytes in the file. But, we have to make sure they map
511
* to this compressed extent on disk.
512
*/
513
if (!em || cur < em->start ||
514
(cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
515
(btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
516
orig_bio->bi_iter.bi_sector) {
517
btrfs_free_extent_map(em);
518
btrfs_unlock_extent(tree, cur, page_end, NULL);
519
folio_unlock(folio);
520
folio_put(folio);
521
break;
522
}
523
add_size = min(em->start + em->len, page_end + 1) - cur;
524
btrfs_free_extent_map(em);
525
btrfs_unlock_extent(tree, cur, page_end, NULL);
526
527
if (folio_contains(folio, end_index)) {
528
size_t zero_offset = offset_in_folio(folio, isize);
529
530
if (zero_offset) {
531
int zeros;
532
zeros = folio_size(folio) - zero_offset;
533
folio_zero_range(folio, zero_offset, zeros);
534
}
535
}
536
537
if (!bio_add_folio(orig_bio, folio, add_size,
538
offset_in_folio(folio, cur))) {
539
folio_unlock(folio);
540
folio_put(folio);
541
break;
542
}
543
/*
544
* If it's subpage, we also need to increase its
545
* subpage::readers number, as at endio we will decrease
546
* subpage::readers and unlock the page.
547
*/
548
if (fs_info->sectorsize < PAGE_SIZE)
549
btrfs_folio_set_lock(fs_info, folio, cur, add_size);
550
folio_put(folio);
551
cur += add_size;
552
}
553
return 0;
554
}
555
556
/*
557
* for a compressed read, the bio we get passed has all the inode pages
558
* in it. We don't actually do IO on those pages but allocate new ones
559
* to hold the compressed pages on disk.
560
*
561
* bio->bi_iter.bi_sector points to the compressed extent on disk
562
* bio->bi_io_vec points to all of the inode pages
563
*
564
* After the compressed pages are read, we copy the bytes into the
565
* bio we were passed and then call the bio end_io calls
566
*/
567
void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
568
{
569
struct btrfs_inode *inode = bbio->inode;
570
struct btrfs_fs_info *fs_info = inode->root->fs_info;
571
struct extent_map_tree *em_tree = &inode->extent_tree;
572
struct compressed_bio *cb;
573
unsigned int compressed_len;
574
u64 file_offset = bbio->file_offset;
575
u64 em_len;
576
u64 em_start;
577
struct extent_map *em;
578
unsigned long pflags;
579
int memstall = 0;
580
blk_status_t status;
581
int ret;
582
583
/* we need the actual starting offset of this extent in the file */
584
read_lock(&em_tree->lock);
585
em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
586
read_unlock(&em_tree->lock);
587
if (!em) {
588
status = BLK_STS_IOERR;
589
goto out;
590
}
591
592
ASSERT(btrfs_extent_map_is_compressed(em));
593
compressed_len = em->disk_num_bytes;
594
595
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
596
end_bbio_compressed_read);
597
598
cb->start = em->start - em->offset;
599
em_len = em->len;
600
em_start = em->start;
601
602
cb->len = bbio->bio.bi_iter.bi_size;
603
cb->compressed_len = compressed_len;
604
cb->compress_type = btrfs_extent_map_compression(em);
605
cb->orig_bbio = bbio;
606
cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
607
608
btrfs_free_extent_map(em);
609
610
cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
611
cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
612
if (!cb->compressed_folios) {
613
status = BLK_STS_RESOURCE;
614
goto out_free_bio;
615
}
616
617
ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
618
cb->compressed_folios);
619
if (ret) {
620
status = BLK_STS_RESOURCE;
621
goto out_free_compressed_pages;
622
}
623
624
add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
625
&pflags);
626
627
/* Include any pages we added in add_ra_bio_pages(). */
628
cb->len = bbio->bio.bi_iter.bi_size;
629
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
630
btrfs_add_compressed_bio_folios(cb);
631
632
if (memstall)
633
psi_memstall_leave(&pflags);
634
635
btrfs_submit_bbio(&cb->bbio, 0);
636
return;
637
638
out_free_compressed_pages:
639
kfree(cb->compressed_folios);
640
out_free_bio:
641
bio_put(&cb->bbio.bio);
642
out:
643
btrfs_bio_end_io(bbio, status);
644
}
645
646
/*
647
* Heuristic uses systematic sampling to collect data from the input data
648
* range, the logic can be tuned by the following constants:
649
*
650
* @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
651
* @SAMPLING_INTERVAL - range from which the sampled data can be collected
652
*/
653
#define SAMPLING_READ_SIZE (16)
654
#define SAMPLING_INTERVAL (256)
655
656
/*
657
* For statistical analysis of the input data we consider bytes that form a
658
* Galois Field of 256 objects. Each object has an attribute count, ie. how
659
* many times the object appeared in the sample.
660
*/
661
#define BUCKET_SIZE (256)
662
663
/*
664
* The size of the sample is based on a statistical sampling rule of thumb.
665
* The common way is to perform sampling tests as long as the number of
666
* elements in each cell is at least 5.
667
*
668
* Instead of 5, we choose 32 to obtain more accurate results.
669
* If the data contain the maximum number of symbols, which is 256, we obtain a
670
* sample size bound by 8192.
671
*
672
* For a sample of at most 8KB of data per data range: 16 consecutive bytes
673
* from up to 512 locations.
674
*/
675
#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
676
SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
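/*
* Worked out with the constants above and BTRFS_MAX_UNCOMPRESSED = 128K:
* 128K * 16 / 256 = 8192 bytes, i.e. 16 consecutive bytes from up to 512
* sampling locations per 128K input range.
*/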
677
678
struct bucket_item {
679
u32 count;
680
};
681
682
struct heuristic_ws {
683
/* Partial copy of input data */
684
u8 *sample;
685
u32 sample_size;
686
/* Buckets store counters for each byte value */
687
struct bucket_item *bucket;
688
/* Sorting buffer */
689
struct bucket_item *bucket_b;
690
struct list_head list;
691
};
692
693
static void free_heuristic_ws(struct list_head *ws)
694
{
695
struct heuristic_ws *workspace;
696
697
workspace = list_entry(ws, struct heuristic_ws, list);
698
699
kvfree(workspace->sample);
700
kfree(workspace->bucket);
701
kfree(workspace->bucket_b);
702
kfree(workspace);
703
}
704
705
static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
706
{
707
struct heuristic_ws *ws;
708
709
ws = kzalloc(sizeof(*ws), GFP_KERNEL);
710
if (!ws)
711
return ERR_PTR(-ENOMEM);
712
713
ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
714
if (!ws->sample)
715
goto fail;
716
717
ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
718
if (!ws->bucket)
719
goto fail;
720
721
ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
722
if (!ws->bucket_b)
723
goto fail;
724
725
INIT_LIST_HEAD(&ws->list);
726
return &ws->list;
727
fail:
728
free_heuristic_ws(&ws->list);
729
return ERR_PTR(-ENOMEM);
730
}
731
732
const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
733
734
static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
735
/* The heuristic is represented as compression type 0 */
736
&btrfs_heuristic_compress,
737
&btrfs_zlib_compress,
738
&btrfs_lzo_compress,
739
&btrfs_zstd_compress,
740
};
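/*
* The array is indexed by the BTRFS_COMPRESS_* values, matching the order
* of btrfs_compress_types[] above: 0 none/heuristic, 1 zlib, 2 lzo, 3 zstd.
*/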
741
742
static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
743
{
744
switch (type) {
745
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
746
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
747
case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
748
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
749
default:
750
/*
751
* This can't happen, the type is validated several times
752
* before we get here.
753
*/
754
BUG();
755
}
756
}
757
758
static void free_workspace(int type, struct list_head *ws)
759
{
760
switch (type) {
761
case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
762
case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
763
case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
764
case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
765
default:
766
/*
767
* This can't happen, the type is validated several times
768
* before we get here.
769
*/
770
BUG();
771
}
772
}
773
774
static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
775
enum btrfs_compression_type type)
776
{
777
struct workspace_manager *gwsm;
778
struct list_head *workspace;
779
780
ASSERT(fs_info->compr_wsm[type] == NULL);
781
gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
782
if (!gwsm)
783
return -ENOMEM;
784
785
INIT_LIST_HEAD(&gwsm->idle_ws);
786
spin_lock_init(&gwsm->ws_lock);
787
atomic_set(&gwsm->total_ws, 0);
788
init_waitqueue_head(&gwsm->ws_wait);
789
fs_info->compr_wsm[type] = gwsm;
790
791
/*
792
* Preallocate one workspace for each compression type so we can
793
* guarantee forward progress in the worst case
794
*/
795
workspace = alloc_workspace(fs_info, type, 0);
796
if (IS_ERR(workspace)) {
797
btrfs_warn(fs_info,
798
"cannot preallocate compression workspace for %s, will try later",
799
btrfs_compress_type2str(type));
800
} else {
801
atomic_set(&gwsm->total_ws, 1);
802
gwsm->free_ws = 1;
803
list_add(workspace, &gwsm->idle_ws);
804
}
805
return 0;
806
}
807
808
static void free_workspace_manager(struct btrfs_fs_info *fs_info,
809
enum btrfs_compression_type type)
810
{
811
struct list_head *ws;
812
struct workspace_manager *gwsm = fs_info->compr_wsm[type];
813
814
/* ZSTD uses its own workspace manager and should not enter here. */
815
ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
816
if (!gwsm)
817
return;
818
fs_info->compr_wsm[type] = NULL;
819
while (!list_empty(&gwsm->idle_ws)) {
820
ws = gwsm->idle_ws.next;
821
list_del(ws);
822
free_workspace(type, ws);
823
atomic_dec(&gwsm->total_ws);
824
}
825
kfree(gwsm);
826
}
827
828
/*
829
* This finds an available workspace or allocates a new one.
830
* If it's not possible to allocate a new one, waits until there's one.
831
* Preallocation makes a forward progress guarantees and we do not return
832
* errors.
833
*/
834
struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
835
{
836
struct workspace_manager *wsm = fs_info->compr_wsm[type];
837
struct list_head *workspace;
838
int cpus = num_online_cpus();
839
unsigned nofs_flag;
840
struct list_head *idle_ws;
841
spinlock_t *ws_lock;
842
atomic_t *total_ws;
843
wait_queue_head_t *ws_wait;
844
int *free_ws;
845
846
ASSERT(wsm);
847
idle_ws = &wsm->idle_ws;
848
ws_lock = &wsm->ws_lock;
849
total_ws = &wsm->total_ws;
850
ws_wait = &wsm->ws_wait;
851
free_ws = &wsm->free_ws;
852
853
again:
854
spin_lock(ws_lock);
855
if (!list_empty(idle_ws)) {
856
workspace = idle_ws->next;
857
list_del(workspace);
858
(*free_ws)--;
859
spin_unlock(ws_lock);
860
return workspace;
861
862
}
863
if (atomic_read(total_ws) > cpus) {
864
DEFINE_WAIT(wait);
865
866
spin_unlock(ws_lock);
867
prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
868
if (atomic_read(total_ws) > cpus && !*free_ws)
869
schedule();
870
finish_wait(ws_wait, &wait);
871
goto again;
872
}
873
atomic_inc(total_ws);
874
spin_unlock(ws_lock);
875
876
/*
877
* Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
878
* to turn it off here because we might get called from the restricted
879
* context of btrfs_compress_bio/btrfs_compress_pages
880
*/
881
nofs_flag = memalloc_nofs_save();
882
workspace = alloc_workspace(fs_info, type, level);
883
memalloc_nofs_restore(nofs_flag);
884
885
if (IS_ERR(workspace)) {
886
atomic_dec(total_ws);
887
wake_up(ws_wait);
888
889
/*
890
* Do not return the error but go back to waiting. There's a
891
* workspace preallocated for each type and the compression
892
* time is bounded so we get to a workspace eventually. This
893
* makes our caller's life easier.
894
*
895
* To prevent silent and low-probability deadlocks (when the
896
* initial preallocation fails), check if there are any
897
* workspaces at all.
898
*/
899
if (atomic_read(total_ws) == 0) {
900
static DEFINE_RATELIMIT_STATE(_rs,
901
/* once per minute */ 60 * HZ,
902
/* no burst */ 1);
903
904
if (__ratelimit(&_rs))
905
btrfs_warn(fs_info,
906
"no compression workspaces, low memory, retrying");
907
}
908
goto again;
909
}
910
return workspace;
911
}
912
913
static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
914
{
915
switch (type) {
916
case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
917
case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
918
case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
919
case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
920
default:
921
/*
922
* This can't happen, the type is validated several times
923
* before we get here.
924
*/
925
BUG();
926
}
927
}
928
929
/*
930
* put a workspace struct back on the list or free it if we have enough
931
* idle ones sitting around
932
*/
933
void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
934
{
935
struct workspace_manager *gwsm = fs_info->compr_wsm[type];
936
struct list_head *idle_ws;
937
spinlock_t *ws_lock;
938
atomic_t *total_ws;
939
wait_queue_head_t *ws_wait;
940
int *free_ws;
941
942
ASSERT(gwsm);
943
idle_ws = &gwsm->idle_ws;
944
ws_lock = &gwsm->ws_lock;
945
total_ws = &gwsm->total_ws;
946
ws_wait = &gwsm->ws_wait;
947
free_ws = &gwsm->free_ws;
948
949
spin_lock(ws_lock);
950
if (*free_ws <= num_online_cpus()) {
951
list_add(ws, idle_ws);
952
(*free_ws)++;
953
spin_unlock(ws_lock);
954
goto wake;
955
}
956
spin_unlock(ws_lock);
957
958
free_workspace(type, ws);
959
atomic_dec(total_ws);
960
wake:
961
cond_wake_up(ws_wait);
962
}
963
964
static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
965
{
966
switch (type) {
967
case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
968
case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
969
case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
970
case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
971
default:
972
/*
973
* This can't happen, the type is validated several times
974
* before we get here.
975
*/
976
BUG();
977
}
978
}
979
980
/*
981
* Adjust @level according to the limits of the compression algorithm or
982
* fallback to default
983
*/
984
static int btrfs_compress_set_level(unsigned int type, int level)
985
{
986
const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
987
988
if (level == 0)
989
level = levels->default_level;
990
else
991
level = clamp(level, levels->min_level, levels->max_level);
992
993
return level;
994
}
995
996
/*
997
* Check whether the @level is within the valid range for the given type.
998
*/
999
bool btrfs_compress_level_valid(unsigned int type, int level)
1000
{
1001
const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
1002
1003
return levels->min_level <= level && level <= levels->max_level;
1004
}
1005
1006
/* Wrapper around filemap_get_folio(), with an extra error message. */
1007
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
1008
struct folio **in_folio_ret)
1009
{
1010
struct folio *in_folio;
1011
1012
/*
1013
* The compressed write path should have the folio locked already, thus
1014
* we only need to grab one reference.
1015
*/
1016
in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
1017
if (IS_ERR(in_folio)) {
1018
struct btrfs_inode *inode = BTRFS_I(mapping->host);
1019
1020
btrfs_crit(inode->root->fs_info,
1021
"failed to get page cache, root %lld ino %llu file offset %llu",
1022
btrfs_root_id(inode->root), btrfs_ino(inode), start);
1023
return -ENOENT;
1024
}
1025
*in_folio_ret = in_folio;
1026
return 0;
1027
}
1028
1029
/*
1030
* Given an address space and start and length, compress the bytes into @folios
1031
* that are allocated on demand.
1032
*
1033
* @type and @level specify the compression algorithm and its level, where
1035
* level 0 means whatever default the algorithm chooses and is opaque here;
1036
* - the compression algorithms are types 0-3
1037
* - the level is clamped to the algorithm's supported range
1037
*
1038
* @out_folios is an in/out parameter, holds maximum number of folios to allocate
1039
* and returns number of actually allocated folios
1040
*
1041
* @total_in is used to return the number of bytes actually read. It
1042
* may be smaller than the input length if we had to exit early because we
1043
* ran out of room in the folios array or because we cross the
1044
* max_out threshold.
1045
*
1046
* @total_out is an in/out parameter, must be set to the input length and will
1047
* be also used to return the total number of compressed bytes
1048
*/
1049
int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
1050
u64 start, struct folio **folios, unsigned long *out_folios,
1051
unsigned long *total_in, unsigned long *total_out)
1052
{
1053
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1054
const unsigned long orig_len = *total_out;
1055
struct list_head *workspace;
1056
int ret;
1057
1058
level = btrfs_compress_set_level(type, level);
1059
workspace = get_workspace(fs_info, type, level);
1060
ret = compression_compress_pages(type, workspace, inode, start, folios,
1061
out_folios, total_in, total_out);
1062
/* The total read-in bytes should be no larger than the input. */
1063
ASSERT(*total_in <= orig_len);
1064
put_workspace(fs_info, type, workspace);
1065
return ret;
1066
}
1067
1068
static int btrfs_decompress_bio(struct compressed_bio *cb)
1069
{
1070
struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
1071
struct list_head *workspace;
1072
int ret;
1073
int type = cb->compress_type;
1074
1075
workspace = get_workspace(fs_info, type, 0);
1076
ret = compression_decompress_bio(workspace, cb);
1077
put_workspace(fs_info, type, workspace);
1078
1079
if (!ret)
1080
zero_fill_bio(&cb->orig_bbio->bio);
1081
return ret;
1082
}
1083
1084
/*
1085
* a less complex decompression routine. Our compressed data fits in a
1086
* single page, and we want to read a single page out of it.
1087
* dest_pgoff tells us the offset into the destination folio where we write the
1088
* decompressed data.
1089
*/
1090
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
1091
unsigned long dest_pgoff, size_t srclen, size_t destlen)
1092
{
1093
struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
1094
struct list_head *workspace;
1095
const u32 sectorsize = fs_info->sectorsize;
1096
int ret;
1097
1098
/*
1099
* The full destination folio range should not exceed the folio size.
1100
* And the @destlen should not exceed sectorsize, as this is only called for
1101
* inline file extents, which should not exceed sectorsize.
1102
*/
1103
ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
1104
1105
workspace = get_workspace(fs_info, type, 0);
1106
ret = compression_decompress(type, workspace, data_in, dest_folio,
1107
dest_pgoff, srclen, destlen);
1108
put_workspace(fs_info, type, workspace);
1109
1110
return ret;
1111
}
1112
1113
int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
1114
{
1115
int ret;
1116
1117
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
1118
if (ret < 0)
1119
goto error;
1120
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
1121
if (ret < 0)
1122
goto error;
1123
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
1124
if (ret < 0)
1125
goto error;
1126
ret = zstd_alloc_workspace_manager(fs_info);
1127
if (ret < 0)
1128
goto error;
1129
return 0;
1130
error:
1131
btrfs_free_compress_wsm(fs_info);
1132
return ret;
1133
}
1134
1135
void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
1136
{
1137
free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
1138
free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
1139
free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
1140
zstd_free_workspace_manager(fs_info);
1141
}
1142
1143
int __init btrfs_init_compress(void)
1144
{
1145
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
1146
offsetof(struct compressed_bio, bbio.bio),
1147
BIOSET_NEED_BVECS))
1148
return -ENOMEM;
1149
1150
compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
1151
if (!compr_pool.shrinker)
1152
return -ENOMEM;
1153
1154
spin_lock_init(&compr_pool.lock);
1155
INIT_LIST_HEAD(&compr_pool.list);
1156
compr_pool.count = 0;
1157
/* 128K / 4K = 32, for 8 threads that is 256 pages. */
1158
compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
1159
compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
1160
compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
1161
compr_pool.shrinker->batch = 32;
1162
compr_pool.shrinker->seeks = DEFAULT_SEEKS;
1163
shrinker_register(compr_pool.shrinker);
1164
1165
return 0;
1166
}
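/*
* With 4K pages the pool threshold above works out to 32 * 8 = 256 pages,
* i.e. at most 1 MiB of cached compression pages before the shrinker or
* btrfs_free_compr_folio() starts releasing them.
*/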
1167
1168
void __cold btrfs_exit_compress(void)
1169
{
1170
/* For now scan drains all pages and does not touch the parameters. */
1171
btrfs_compr_pool_scan(NULL, NULL);
1172
shrinker_free(compr_pool.shrinker);
1173
1174
bioset_exit(&btrfs_compressed_bioset);
1175
}
1176
1177
/*
1178
* The bvec is a single page bvec from a bio that contains folios from a filemap.
1179
*
1180
* Since the folio may be a large one, if the bv_page is not the head page of
1181
* a large folio, then page->index is unreliable.
1182
*
1183
* Thus we need this helper to grab the proper file offset.
1184
*/
1185
static u64 file_offset_from_bvec(const struct bio_vec *bvec)
1186
{
1187
const struct page *page = bvec->bv_page;
1188
const struct folio *folio = page_folio(page);
1189
1190
return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
1191
}
1192
1193
/*
1194
* Copy decompressed data from working buffer to pages.
1195
*
1196
* @buf: The decompressed data buffer
1197
* @buf_len: The decompressed data length
1198
* @decompressed: Number of bytes that are already decompressed inside the
1199
* compressed extent
1200
* @cb: The compressed extent descriptor
1201
* @orig_bio: The original bio that the caller wants to read for
1202
*
1203
* An easier to understand graph is like below:
1204
*
1205
* |<- orig_bio ->| |<- orig_bio->|
1206
* |<------- full decompressed extent ----->|
1207
* |<----------- @cb range ---->|
1208
* | |<-- @buf_len -->|
1209
* |<--- @decompressed --->|
1210
*
1211
* Note that @cb can be a subpage of the full decompressed extent, but
1212
* @cb->start always matches the start file offset of the full
1213
* decompressed extent.
1214
*
1215
* When reading a compressed extent, we have to read the full compressed extent,
1216
* while @orig_bio may only want part of the range.
1217
* Thus this function ensures that only data covered by @orig_bio is copied
1218
* to it.
1219
*
1220
* Return 0 if we have copied all needed contents for @orig_bio.
1221
* Return >0 if we need continue decompress.
1222
*/
1223
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
1224
struct compressed_bio *cb, u32 decompressed)
1225
{
1226
struct bio *orig_bio = &cb->orig_bbio->bio;
1227
/* Offset inside the full decompressed extent */
1228
u32 cur_offset;
1229
1230
cur_offset = decompressed;
1231
/* The main loop to do the copy */
1232
while (cur_offset < decompressed + buf_len) {
1233
struct bio_vec bvec;
1234
size_t copy_len;
1235
u32 copy_start;
1236
/* Offset inside the full decompressed extent */
1237
u32 bvec_offset;
1238
void *kaddr;
1239
1240
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
1241
/*
1242
* cb->start may underflow, but subtracting that value can still
1243
* give us correct offset inside the full decompressed extent.
1244
*/
1245
bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
1246
1247
/* Haven't reached the bvec range, exit */
1248
if (decompressed + buf_len <= bvec_offset)
1249
return 1;
1250
1251
copy_start = max(cur_offset, bvec_offset);
1252
copy_len = min(bvec_offset + bvec.bv_len,
1253
decompressed + buf_len) - copy_start;
1254
ASSERT(copy_len);
1255
1256
/*
1257
* Extra range check to ensure we didn't go beyond
1258
* @buf + @buf_len.
1259
*/
1260
ASSERT(copy_start - decompressed < buf_len);
1261
1262
kaddr = bvec_kmap_local(&bvec);
1263
memcpy(kaddr, buf + copy_start - decompressed, copy_len);
1264
kunmap_local(kaddr);
1265
1266
cur_offset += copy_len;
1267
bio_advance(orig_bio, copy_len);
1268
/* Finished the bio */
1269
if (!orig_bio->bi_iter.bi_size)
1270
return 0;
1271
}
1272
return 1;
1273
}
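/*
* Illustrative example for the function above: with cb->start == 0 and an
* orig_bio covering only [64K, 68K), calls made while
* decompressed + buf_len <= 64K return 1 (keep decompressing); once the
* buffer overlaps the bvec range only the overlapping bytes are copied,
* and 0 is returned as soon as orig_bio has been fully advanced.
*/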
1274
1275
/*
1276
* Shannon Entropy calculation
1277
*
1278
* Pure byte distribution analysis fails to determine compressibility of data.
1279
* Try calculating entropy to estimate the average minimum number of bits
1280
* needed to encode the sampled data.
1281
*
1282
* For convenience, return the percentage of needed bits, instead of amount of
1283
* bits directly.
1284
*
1285
* @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
1286
* and can be compressible with high probability
1287
*
1288
* @ENTROPY_LVL_HIGH - data are not compressible with high probability
1289
*
1290
* Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
1291
*/
1292
#define ENTROPY_LVL_ACEPTABLE (65)
1293
#define ENTROPY_LVL_HIGH (80)
1294
1295
/*
1296
* For increased precision in shannon_entropy calculation,
1297
* let's do pow(n, M) to save more digits after comma:
1298
*
1299
* - maximum int bit length is 64
1300
* - ilog2(MAX_SAMPLE_SIZE) -> 13
1301
* - 13 * 4 = 52 < 64 -> M = 4
1302
*
1303
* So use pow(n, 4).
1304
*/
1305
static inline u32 ilog2_w(u64 n)
1306
{
1307
return ilog2(n * n * n * n);
1308
}
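/*
* Since ilog2_w(n) = ilog2(n^4) ~= 4 * log2(n), every term in
* shannon_entropy() below is scaled by the same factor of 4, and
* entropy_max = 8 * ilog2_w(2) = 32 corresponds to the full 8 bits per
* byte, so the returned value is a percentage of 8 bits.
*/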
1309
1310
static u32 shannon_entropy(struct heuristic_ws *ws)
1311
{
1312
const u32 entropy_max = 8 * ilog2_w(2);
1313
u32 entropy_sum = 0;
1314
u32 p, p_base, sz_base;
1315
u32 i;
1316
1317
sz_base = ilog2_w(ws->sample_size);
1318
for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
1319
p = ws->bucket[i].count;
1320
p_base = ilog2_w(p);
1321
entropy_sum += p * (sz_base - p_base);
1322
}
1323
1324
entropy_sum /= ws->sample_size;
1325
return entropy_sum * 100 / entropy_max;
1326
}
1327
1328
#define RADIX_BASE 4U
1329
#define COUNTERS_SIZE (1U << RADIX_BASE)
1330
1331
static u8 get4bits(u64 num, int shift) {
1332
u8 low4bits;
1333
1334
num >>= shift;
1335
/* Reverse order */
1336
low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
1337
return low4bits;
1338
}
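/*
* Because of the (COUNTERS_SIZE - 1) - ... reversal above, radix_sort()
* effectively orders the buckets by descending count, which is what
* byte_core_set_size() relies on when it scans the most frequent byte
* values first.
*/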
1339
1340
/*
1341
* Use 4 bits as radix base
1342
* Use 16 u32 counters for calculating new position in buf array
1343
*
1344
* @array - array that will be sorted
1345
* @array_buf - buffer array to store sorting results
1346
* must be equal in size to @array
1347
* @num - array size
1348
*/
1349
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
1350
int num)
1351
{
1352
u64 max_num;
1353
u64 buf_num;
1354
u32 counters[COUNTERS_SIZE];
1355
u32 new_addr;
1356
u32 addr;
1357
int bitlen;
1358
int shift;
1359
int i;
1360
1361
/*
1362
* Try avoid useless loop iterations for small numbers stored in big
1363
* counters. Example: 48 33 4 ... in 64bit array
1364
*/
1365
max_num = array[0].count;
1366
for (i = 1; i < num; i++) {
1367
buf_num = array[i].count;
1368
if (buf_num > max_num)
1369
max_num = buf_num;
1370
}
1371
1372
buf_num = ilog2(max_num);
1373
bitlen = ALIGN(buf_num, RADIX_BASE * 2);
1374
1375
shift = 0;
1376
while (shift < bitlen) {
1377
memset(counters, 0, sizeof(counters));
1378
1379
for (i = 0; i < num; i++) {
1380
buf_num = array[i].count;
1381
addr = get4bits(buf_num, shift);
1382
counters[addr]++;
1383
}
1384
1385
for (i = 1; i < COUNTERS_SIZE; i++)
1386
counters[i] += counters[i - 1];
1387
1388
for (i = num - 1; i >= 0; i--) {
1389
buf_num = array[i].count;
1390
addr = get4bits(buf_num, shift);
1391
counters[addr]--;
1392
new_addr = counters[addr];
1393
array_buf[new_addr] = array[i];
1394
}
1395
1396
shift += RADIX_BASE;
1397
1398
/*
1399
* Normal radix expects to move data from a temporary array to
1400
* the main one. But that requires some CPU time. Avoid that
1401
* by doing another sort iteration to the original array instead of
1402
* memcpy()
1403
*/
1404
memset(counters, 0, sizeof(counters));
1405
1406
for (i = 0; i < num; i ++) {
1407
buf_num = array_buf[i].count;
1408
addr = get4bits(buf_num, shift);
1409
counters[addr]++;
1410
}
1411
1412
for (i = 1; i < COUNTERS_SIZE; i++)
1413
counters[i] += counters[i - 1];
1414
1415
for (i = num - 1; i >= 0; i--) {
1416
buf_num = array_buf[i].count;
1417
addr = get4bits(buf_num, shift);
1418
counters[addr]--;
1419
new_addr = counters[addr];
1420
array[new_addr] = array_buf[i];
1421
}
1422
1423
shift += RADIX_BASE;
1424
}
1425
}
1426
1427
/*
1428
* Size of the core byte set - how many bytes cover 90% of the sample
1429
*
1430
* There are several types of structured binary data that use nearly all byte
1431
* values. The distribution can be uniform and counts in all buckets will be
1432
* nearly the same (eg. encrypted data). Unlikely to be compressible.
1433
*
1434
* Other possibility is normal (Gaussian) distribution, where the data could
1435
* be potentially compressible, but we have to take a few more steps to decide
1436
* how much.
1437
*
1438
* @BYTE_CORE_SET_LOW - main part of byte values repeated frequently,
1439
* compression algo can easily fix that
1440
* @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
1441
* probability is not compressible
1442
*/
1443
#define BYTE_CORE_SET_LOW (64)
1444
#define BYTE_CORE_SET_HIGH (200)
1445
1446
static int byte_core_set_size(struct heuristic_ws *ws)
1447
{
1448
u32 i;
1449
u32 coreset_sum = 0;
1450
const u32 core_set_threshold = ws->sample_size * 90 / 100;
1451
struct bucket_item *bucket = ws->bucket;
1452
1453
/* Sort in reverse order */
1454
radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
1455
1456
for (i = 0; i < BYTE_CORE_SET_LOW; i++)
1457
coreset_sum += bucket[i].count;
1458
1459
if (coreset_sum > core_set_threshold)
1460
return i;
1461
1462
for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
1463
coreset_sum += bucket[i].count;
1464
if (coreset_sum > core_set_threshold)
1465
break;
1466
}
1467
1468
return i;
1469
}
1470
1471
/*
1472
* Count byte values in buckets.
1473
* This heuristic can detect textual data (configs, xml, json, html, etc).
1474
* Because in most text-like data the byte set is restricted to a limited number
1476
* of possible characters, and that restriction in most cases makes the data easy to
1476
* compress.
1477
*
1478
* @BYTE_SET_THRESHOLD - consider all data within this byte set size:
1479
* less - compressible
1480
* more - need additional analysis
1481
*/
1482
#define BYTE_SET_THRESHOLD (64)
1483
1484
static u32 byte_set_size(const struct heuristic_ws *ws)
1485
{
1486
u32 i;
1487
u32 byte_set_size = 0;
1488
1489
for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
1490
if (ws->bucket[i].count > 0)
1491
byte_set_size++;
1492
}
1493
1494
/*
1495
* Continue collecting count of byte values in buckets. If the byte
1496
* set size is bigger then the threshold, it's pointless to continue,
1497
* the detection technique would fail for this type of data.
1498
*/
1499
for (; i < BUCKET_SIZE; i++) {
1500
if (ws->bucket[i].count > 0) {
1501
byte_set_size++;
1502
if (byte_set_size > BYTE_SET_THRESHOLD)
1503
return byte_set_size;
1504
}
1505
}
1506
1507
return byte_set_size;
1508
}
1509
1510
static bool sample_repeated_patterns(struct heuristic_ws *ws)
1511
{
1512
const u32 half_of_sample = ws->sample_size / 2;
1513
const u8 *data = ws->sample;
1514
1515
return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
1516
}
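/*
* This catches e.g. an all-zero sample or a repeating pattern whose period
* divides half of the sample size, covering the "mostly zero data" case
* mentioned in the heuristic description below.
*/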
1517
1518
static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1519
struct heuristic_ws *ws)
1520
{
1521
struct page *page;
1522
pgoff_t index, index_end;
1523
u32 i, curr_sample_pos;
1524
u8 *in_data;
1525
1526
/*
1527
* Compression handles the input data by chunks of 128KiB
1528
* (defined by BTRFS_MAX_UNCOMPRESSED)
1529
*
1530
* We do the same for the heuristic and loop over the whole range.
1531
*
1532
* MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
1533
* process no more than BTRFS_MAX_UNCOMPRESSED at a time.
1534
*/
1535
if (end - start > BTRFS_MAX_UNCOMPRESSED)
1536
end = start + BTRFS_MAX_UNCOMPRESSED;
1537
1538
index = start >> PAGE_SHIFT;
1539
index_end = end >> PAGE_SHIFT;
1540
1541
/* Don't miss unaligned end */
1542
if (!PAGE_ALIGNED(end))
1543
index_end++;
1544
1545
curr_sample_pos = 0;
1546
while (index < index_end) {
1547
page = find_get_page(inode->i_mapping, index);
1548
in_data = kmap_local_page(page);
1549
/* Handle case where the start is not aligned to PAGE_SIZE */
1550
i = start % PAGE_SIZE;
1551
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
1552
/* Don't sample any garbage from the last page */
1553
if (start > end - SAMPLING_READ_SIZE)
1554
break;
1555
memcpy(&ws->sample[curr_sample_pos], &in_data[i],
1556
SAMPLING_READ_SIZE);
1557
i += SAMPLING_INTERVAL;
1558
start += SAMPLING_INTERVAL;
1559
curr_sample_pos += SAMPLING_READ_SIZE;
1560
}
1561
kunmap_local(in_data);
1562
put_page(page);
1563
1564
index++;
1565
}
1566
1567
ws->sample_size = curr_sample_pos;
1568
}
1569
1570
/*
1571
* Compression heuristic.
1572
*
1573
* The following types of analysis can be performed:
1574
* - detect mostly zero data
1575
* - detect data with low "byte set" size (text, etc)
1576
* - detect data with low/high "core byte" set
1577
*
1578
* Return non-zero if the compression should be done, 0 otherwise.
1579
*/
1580
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
1581
{
1582
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1583
struct list_head *ws_list = get_workspace(fs_info, 0, 0);
1584
struct heuristic_ws *ws;
1585
u32 i;
1586
u8 byte;
1587
int ret = 0;
1588
1589
ws = list_entry(ws_list, struct heuristic_ws, list);
1590
1591
heuristic_collect_sample(&inode->vfs_inode, start, end, ws);
1592
1593
if (sample_repeated_patterns(ws)) {
1594
ret = 1;
1595
goto out;
1596
}
1597
1598
memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
1599
1600
for (i = 0; i < ws->sample_size; i++) {
1601
byte = ws->sample[i];
1602
ws->bucket[byte].count++;
1603
}
1604
1605
i = byte_set_size(ws);
1606
if (i < BYTE_SET_THRESHOLD) {
1607
ret = 2;
1608
goto out;
1609
}
1610
1611
i = byte_core_set_size(ws);
1612
if (i <= BYTE_CORE_SET_LOW) {
1613
ret = 3;
1614
goto out;
1615
}
1616
1617
if (i >= BYTE_CORE_SET_HIGH) {
1618
ret = 0;
1619
goto out;
1620
}
1621
1622
i = shannon_entropy(ws);
1623
if (i <= ENTROPY_LVL_ACEPTABLE) {
1624
ret = 4;
1625
goto out;
1626
}
1627
1628
/*
1629
* For the levels below ENTROPY_LVL_HIGH, additional analysis would be
1630
* needed to give green light to compression.
1631
*
1632
* For now just assume that compression at that level is not worth the
1633
* resources because:
1634
*
1635
* 1. it is possible to defrag the data later
1636
*
1637
* 2. the data would turn out to be hardly compressible, eg. 150 byte
1638
* values, every bucket has counter at level ~54. The heuristic would
1639
* be confused. This can happen when data have some internal repeated
1640
* patterns like "abbacbbc...". This can be detected by analyzing
1641
* pairs of bytes, which is too costly.
1642
*/
1643
if (i < ENTROPY_LVL_HIGH) {
1644
ret = 5;
1645
goto out;
1646
} else {
1647
ret = 0;
1648
goto out;
1649
}
1650
1651
out:
1652
put_workspace(fs_info, 0, ws_list);
1653
return ret;
1654
}
1655
1656
/*
1657
* Convert the compression suffix (eg. after "zlib" starting with ":") to level.
1658
*
1659
* If the resulting level exceeds the algo's supported levels, it will be clamped.
1660
*
1661
* Return <0 if no valid string can be found.
1662
* Return 0 if everything is fine.
1663
*/
1664
int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
1665
{
1666
int level = 0;
1667
int ret;
1668
1669
if (!type) {
1670
*level_ret = btrfs_compress_set_level(type, level);
1671
return 0;
1672
}
1673
1674
if (str[0] == ':') {
1675
ret = kstrtoint(str + 1, 10, &level);
1676
if (ret)
1677
return ret;
1678
}
1679
1680
*level_ret = btrfs_compress_set_level(type, level);
1681
return 0;
1682
}
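/*
* Illustrative usage, e.g. when parsing a "compress=zstd:3" style mount
* option value:
*
*   btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, ":3", &level);
*       level == 3, clamped to the zstd range if out of bounds
*   btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, "", &level);
*       level == the zlib default level
*/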
1683
1684