// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "messages.h"
#include "super.h"
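
/*
 * Bioset used by alloc_compressed_bio(); it is set up in btrfs_init_compress()
 * so that each bio allocated from it is embedded in a struct compressed_bio.
 */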
static struct bio_set btrfs_compressed_bioset;

static const char *const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char *btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
{
	return container_of(bbio, struct compressed_bio, bbio);
}

static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
						   u64 start, blk_opf_t op,
						   btrfs_bio_end_io_t end_io)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
					  GFP_NOFS, &btrfs_compressed_bioset));
	btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
	bbio->inode = inode;
	bbio->file_offset = start;
	return to_compressed_bio(bbio);
}

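/*
 * Note that the comparison below is a prefix match, so a value carrying a
 * level suffix (e.g. "zlib:9") is accepted as well.
 */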
bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}

static int compression_compress_pages(int type, struct list_head *ws,
				      struct address_space *mapping, u64 start,
				      struct folio **folios, unsigned long *out_folios,
				      unsigned long *total_in, unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_folios(ws, mapping, start, folios,
					    out_folios, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_folios(ws, mapping, start, folios,
					   out_folios, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_folios(ws, mapping, start, folios,
					    out_folios, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with a remount that
		 * sets 'no compress', while the caller doesn't call
		 * inode_need_compress() to check whether we really need to
		 * compress.
		 *
		 * Not a big deal, we just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_folios = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (cb->compress_type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
				  const u8 *data_in, struct folio *dest_folio,
				  unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
							 dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_folio,
							dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
							 dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_free_compressed_folios(struct compressed_bio *cb)
{
	for (unsigned int i = 0; i < cb->nr_folios; i++)
		btrfs_free_compr_folio(cb->compressed_folios[i]);
	kfree(cb->compressed_folios);
}

static int btrfs_decompress_bio(struct compressed_bio *cb);

/*
 * Global cache of last unused pages for compression/decompression.
 */
static struct btrfs_compr_pool {
	struct shrinker *shrinker;
	spinlock_t lock;
	struct list_head list;
	int count;
	int thresh;
} compr_pool;
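
/*
 * The threshold is set in btrfs_init_compress(). btrfs_free_compr_folio()
 * caches folios in the pool only while 'count' is at or below 'thresh' and
 * frees them immediately otherwise; the shrinker reports only the part of the
 * pool above the threshold as reclaimable (the scan callback currently drains
 * the whole list).
 */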

static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
{
	int ret;

	/*
	 * We must not read the values more than once: if 'ret' were expanded
	 * in the return statement we could accidentally return a negative
	 * number, even if the first evaluation found it positive.
	 */
	ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);

	return ret > 0 ? ret : 0;
}

static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
{
	struct list_head remove;
	struct list_head *tmp, *next;
	int freed;

	if (compr_pool.count == 0)
		return SHRINK_STOP;

	INIT_LIST_HEAD(&remove);

	/* For now, just simply drain the whole list. */
	spin_lock(&compr_pool.lock);
	list_splice_init(&compr_pool.list, &remove);
	freed = compr_pool.count;
	compr_pool.count = 0;
	spin_unlock(&compr_pool.lock);

	list_for_each_safe(tmp, next, &remove) {
		struct page *page = list_entry(tmp, struct page, lru);

		ASSERT(page_ref_count(page) == 1);
		put_page(page);
	}

	return freed;
}

/*
 * Common folio allocation wrappers used by the compression code.
 */
struct folio *btrfs_alloc_compr_folio(void)
{
	struct folio *folio = NULL;

	spin_lock(&compr_pool.lock);
	if (compr_pool.count > 0) {
		folio = list_first_entry(&compr_pool.list, struct folio, lru);
		list_del_init(&folio->lru);
		compr_pool.count--;
	}
	spin_unlock(&compr_pool.lock);

	if (folio)
		return folio;

	return folio_alloc(GFP_NOFS, 0);
}

void btrfs_free_compr_folio(struct folio *folio)
{
	bool do_free = false;

	spin_lock(&compr_pool.lock);
	if (compr_pool.count > compr_pool.thresh) {
		do_free = true;
	} else {
		list_add(&folio->lru, &compr_pool.list);
		compr_pool.count++;
	}
	spin_unlock(&compr_pool.lock);

	if (!do_free)
		return;

	ASSERT(folio_ref_count(folio) == 1);
	folio_put(folio);
}

static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
	struct compressed_bio *cb = to_compressed_bio(bbio);
	blk_status_t status = bbio->bio.bi_status;

	if (!status)
		status = errno_to_blk_status(btrfs_decompress_bio(cb));

	btrfs_free_compressed_folios(cb);
	btrfs_bio_end_io(cb->orig_bbio, status);
	bio_put(&bbio->bio);
}

/*
 * Clear the writeback bits on all of the file pages for a compressed write.
 */
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
	struct inode *inode = &cb->bbio.inode->vfs_inode;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	pgoff_t index = cb->start >> PAGE_SHIFT;
	const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	int i;
	int ret;

	ret = blk_status_to_errno(cb->bbio.bio.bi_status);
	if (ret)
		mapping_set_error(inode->i_mapping, ret);

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		ret = filemap_get_folios(inode->i_mapping, &index, end_index,
					 &fbatch);

		if (ret == 0)
			return;

		for (i = 0; i < ret; i++) {
			struct folio *folio = fbatch.folios[i];

			btrfs_folio_clamp_clear_writeback(fs_info, folio,
							  cb->start, cb->len);
		}
		folio_batch_release(&fbatch);
	}
	/* The inode may be gone now. */
}

static void btrfs_finish_compressed_write_work(struct work_struct *work)
{
	struct compressed_bio *cb =
		container_of(work, struct compressed_bio, write_end_work);

	btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
				    cb->bbio.bio.bi_status == BLK_STS_OK);

	if (cb->writeback)
		end_compressed_writeback(cb);
	/* Note, our inode could be gone now. */

	btrfs_free_compressed_folios(cb);
	bio_put(&cb->bbio.bio);
}

/*
 * Do the cleanup once all the compressed pages hit the disk. This will clear
 * writeback on the file pages and free the compressed pages.
 *
 * This also calls the writeback end hooks for the file pages so that metadata
 * and checksums can be updated in the file.
 */
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
	struct compressed_bio *cb = to_compressed_bio(bbio);
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}

static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
	struct bio *bio = &cb->bbio.bio;
	u32 offset = 0;

	while (offset < cb->compressed_len) {
		int ret;
		u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);

		/* Maximum compressed extent is smaller than bio size limit. */
		ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
				    len, 0);
		ASSERT(ret);
		offset += len;
	}
}

/*
 * Worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback, and the
 * compressed folios should have a reference on them so they can be dropped
 * when the IO is complete.
 *
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
				   struct folio **compressed_folios,
				   unsigned int nr_folios,
				   blk_opf_t write_flags,
				   bool writeback)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct compressed_bio *cb;

	ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));

	cb = alloc_compressed_bio(inode, ordered->file_offset,
				  REQ_OP_WRITE | write_flags,
				  end_bbio_compressed_write);
	cb->start = ordered->file_offset;
	cb->len = ordered->num_bytes;
	cb->compressed_folios = compressed_folios;
	cb->compressed_len = ordered->disk_num_bytes;
	cb->writeback = writeback;
	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
	cb->nr_folios = nr_folios;
	cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
	cb->bbio.ordered = ordered;
	btrfs_add_compressed_bio_folios(cb);

	btrfs_submit_bbio(&cb->bbio, 0);
}

/*
 * Add extra pages in the same compressed file extent so that we don't need to
 * re-read the same extent again and again.
 *
 * NOTE: this won't work well for subpage, as for subpage read, we lock the
 * full page then submit bio for each compressed/regular extents.
 *
 * This means that if several sectors in the same page point to the same
 * on-disk compressed data, we will re-read the same extent many times and
 * this function can only help for the next page.
 */
static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb,
				     int *memstall, unsigned long *pflags)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	pgoff_t end_index;
	struct bio *orig_bio = &cb->orig_bbio->bio;
	u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
	u64 isize = i_size_read(inode);
	int ret;
	struct folio *folio;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	int sectors_missed = 0;

	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/*
	 * For current subpage support, we only support 64K page size,
	 * which means maximum compressed extent size (128K) is just 2x page
	 * size.
	 * This makes readahead less effective, so here disable readahead for
	 * subpage for now, until full compressed write is supported.
	 */
	if (fs_info->sectorsize < PAGE_SIZE)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (cur < compressed_end) {
		pgoff_t page_end;
		pgoff_t pg_index = cur >> PAGE_SHIFT;
		u32 add_size;

		if (pg_index > end_index)
			break;

		folio = filemap_get_folio(mapping, pg_index);
		if (!IS_ERR(folio)) {
			u64 folio_sz = folio_size(folio);
			u64 offset = offset_in_folio(folio, cur);

			folio_put(folio);
			sectors_missed += (folio_sz - offset) >>
					  fs_info->sectorsize_bits;

			/* Beyond threshold, no need to continue. */
			if (sectors_missed > 4)
				break;

			/*
			 * Jump to the next page start as we already have a
			 * page for the current offset.
			 */
			cur += (folio_sz - offset);
			continue;
		}

		folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
								   ~__GFP_FS), 0);
		if (!folio)
			break;

		if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
			/* There is already a page, skip to page end. */
			cur += folio_size(folio);
			folio_put(folio);
			continue;
		}

		if (!*memstall && folio_test_workingset(folio)) {
			psi_memstall_enter(pflags);
			*memstall = 1;
		}

		ret = set_folio_extent_mapped(folio);
		if (ret < 0) {
			folio_unlock(folio);
			folio_put(folio);
			break;
		}

		page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
		btrfs_lock_extent(tree, cur, page_end, NULL);
		read_lock(&em_tree->lock);
		em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
		read_unlock(&em_tree->lock);

		/*
		 * At this point, we have a locked page in the page cache for
		 * these bytes in the file. But, we have to make sure they map
		 * to this compressed extent on disk.
		 */
		if (!em || cur < em->start ||
		    (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
		    (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
		    orig_bio->bi_iter.bi_sector) {
			btrfs_free_extent_map(em);
			btrfs_unlock_extent(tree, cur, page_end, NULL);
			folio_unlock(folio);
			folio_put(folio);
			break;
		}
		add_size = min(em->start + em->len, page_end + 1) - cur;
		btrfs_free_extent_map(em);
		btrfs_unlock_extent(tree, cur, page_end, NULL);

		if (folio_contains(folio, end_index)) {
			size_t zero_offset = offset_in_folio(folio, isize);

			if (zero_offset) {
				int zeros;
				zeros = folio_size(folio) - zero_offset;
				folio_zero_range(folio, zero_offset, zeros);
			}
		}

		if (!bio_add_folio(orig_bio, folio, add_size,
				   offset_in_folio(folio, cur))) {
			folio_unlock(folio);
			folio_put(folio);
			break;
		}
		/*
		 * If it's subpage, we also need to increase its
		 * subpage::readers number, as at endio we will decrease
		 * subpage::readers and unlock the page.
		 */
		if (fs_info->sectorsize < PAGE_SIZE)
			btrfs_folio_set_lock(fs_info, folio, cur, add_size);
		folio_put(folio);
		cur += add_size;
	}
	return 0;
}

/*
 * For a compressed read, the bio we get passed has all the inode pages
 * in it. We don't actually do IO on those pages but allocate new ones
 * to hold the compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then call the bio end_io callbacks.
 */
void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct compressed_bio *cb;
	unsigned int compressed_len;
	u64 file_offset = bbio->file_offset;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	unsigned long pflags;
	int memstall = 0;
	blk_status_t status;
	int ret;

	/* We need the actual starting offset of this extent in the file. */
	read_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em) {
		status = BLK_STS_IOERR;
		goto out;
	}

	ASSERT(btrfs_extent_map_is_compressed(em));
	compressed_len = em->disk_num_bytes;

	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
				  end_bbio_compressed_read);

	cb->start = em->start - em->offset;
	em_len = em->len;
	em_start = em->start;

	cb->len = bbio->bio.bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = btrfs_extent_map_compression(em);
	cb->orig_bbio = bbio;

	btrfs_free_extent_map(em);

	cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
	if (!cb->compressed_folios) {
		status = BLK_STS_RESOURCE;
		goto out_free_bio;
	}

	ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
	if (ret) {
		status = BLK_STS_RESOURCE;
		goto out_free_compressed_pages;
	}

	add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
			 &pflags);

	/* Include any pages we added in add_ra_bio_pages. */
	cb->len = bbio->bio.bi_iter.bi_size;
	cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
	btrfs_add_compressed_bio_folios(cb);

	if (memstall)
		psi_memstall_leave(&pflags);

	btrfs_submit_bbio(&cb->bbio, 0);
	return;

out_free_compressed_pages:
	kfree(cb->compressed_folios);
out_free_bio:
	bio_put(&cb->bbio.bio);
out:
	btrfs_bio_end_io(bbio, status);
}

/*
 * The heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied for each sample
 * @SAMPLING_INTERVAL  - the distance between the starts of consecutive samples
 */
#define SAMPLING_READ_SIZE (16)
#define SAMPLING_INTERVAL (256)

/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects. Each object has an attribute count, i.e. how
 * many times the object appeared in the sample.
 */
#define BUCKET_SIZE (256)

/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */
#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
			 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
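
/*
 * With BTRFS_MAX_UNCOMPRESSED == 128K this works out to
 * 131072 * 16 / 256 = 8192 bytes, matching the 8KiB bound mentioned above.
 */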

struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;

static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(void)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}

const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};

static struct list_head *alloc_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace();
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		btrfs_warn(NULL,
			   "cannot preallocate compression workspace, will try later");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}

/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, waits until there's one.
 * Preallocation provides a forward progress guarantee and we do not return
 * errors.
 */
struct list_head *btrfs_get_workspace(int type, int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;

	}
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting. There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually. This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs))
				btrfs_warn(NULL,
					   "no compression workspaces, low memory, retrying");
		}
		goto again;
	}
	return workspace;
}

static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around.
 */
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}

static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Adjust @level according to the limits of the compression algorithm or
 * fall back to the default.
 */
static int btrfs_compress_set_level(unsigned int type, int level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = clamp(level, ops->min_level, ops->max_level);

	return level;
}

/*
 * Check whether the @level is within the valid range for the given type.
 */
bool btrfs_compress_level_valid(unsigned int type, int level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	return ops->min_level <= level && level <= ops->max_level;
}

/* Wrapper around filemap_get_folio(), with an extra error message. */
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
				     struct folio **in_folio_ret)
{
	struct folio *in_folio;

	/*
	 * The compressed write path should have the folio locked already, thus
	 * we only need to grab one reference.
	 */
	in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
	if (IS_ERR(in_folio)) {
		struct btrfs_inode *inode = BTRFS_I(mapping->host);

		btrfs_crit(inode->root->fs_info,
			   "failed to get page cache, root %lld ino %llu file offset %llu",
			   btrfs_root_id(inode->root), btrfs_ino(inode), start);
		return -ENOENT;
	}
	*in_folio_ret = in_folio;
	return 0;
}

/*
 * Given an address space and start and length, compress the bytes into @folios
 * that are allocated on demand.
 *
 * @type and @level select the compression algorithm and level, where level 0
 * means whatever default the algorithm chooses and is opaque here.
 *
 * @out_folios is an in/out parameter, holds the maximum number of folios to
 * allocate and returns the number of actually allocated folios
 *
 * @total_in is used to return the number of bytes actually read. It
 * may be smaller than the input length if we had to exit early because we
 * ran out of room in the folios array or because we crossed the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
 */
int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
			  u64 start, struct folio **folios, unsigned long *out_folios,
			  unsigned long *total_in, unsigned long *total_out)
{
	const unsigned long orig_len = *total_out;
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, folios,
					 out_folios, total_in, total_out);
	/* The total read-in bytes should be no larger than the input. */
	ASSERT(*total_in <= orig_len);
	put_workspace(type, workspace);
	return ret;
}

static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(workspace, cb);
	put_workspace(type, workspace);

	if (!ret)
		zero_fill_bio(&cb->orig_bbio->bio);
	return ret;
}

/*
 * A less complex decompression routine. Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * @dest_pgoff tells us the offset into the destination folio we're interested
 * in.
 */
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
		     unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
	struct list_head *workspace;
	const u32 sectorsize = fs_info->sectorsize;
	int ret;

	/*
	 * The full destination page range should not exceed the page size.
	 * And the @destlen should not exceed sectorsize, as this is only called for
	 * inline file extents, which should not exceed sectorsize.
	 */
	ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_folio,
				     dest_pgoff, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}

int __init btrfs_init_compress(void)
{
	if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
			offsetof(struct compressed_bio, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
	if (!compr_pool.shrinker)
		return -ENOMEM;

	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();

	spin_lock_init(&compr_pool.lock);
	INIT_LIST_HEAD(&compr_pool.list);
	compr_pool.count = 0;
	/* 128K / 4K = 32, for 8 threads that is 256 pages. */
	compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
	compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
	compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
	compr_pool.shrinker->batch = 32;
	compr_pool.shrinker->seeks = DEFAULT_SEEKS;
	shrinker_register(compr_pool.shrinker);

	return 0;
}

void __cold btrfs_exit_compress(void)
{
	/* For now scan drains all pages and does not touch the parameters. */
	btrfs_compr_pool_scan(NULL, NULL);
	shrinker_free(compr_pool.shrinker);

	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
	bioset_exit(&btrfs_compressed_bioset);
}

/*
 * The bvec is a single page bvec from a bio that contains folios from a filemap.
 *
 * Since the folio may be a large one, and if the bv_page is not the head page
 * of a large folio, then page->index is unreliable.
 *
 * Thus we need this helper to grab the proper file offset.
 */
static u64 file_offset_from_bvec(const struct bio_vec *bvec)
{
	const struct page *page = bvec->bv_page;
	const struct folio *folio = page_folio(page);

	return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
}

/*
 * Copy decompressed data from working buffer to pages.
 *
 * @buf:          The decompressed data buffer
 * @buf_len:      The decompressed data length
 * @decompressed: Number of bytes that are already decompressed inside the
 *                compressed extent
 * @cb:           The compressed extent descriptor
 * @orig_bio:     The original bio that the caller wants to read for
 *
 * An easier to understand graph is like below:
 *
 *		|<- orig_bio ->|     |<- orig_bio ->|
 *	|<-------      full decompressed extent      ----->|
 *	|<-----------    @cb range   ---->|
 *	|			|<-- @buf_len -->|
 *	|<--- @decompressed --->|
 *
 * Note that @cb can be a subpage of the full decompressed extent, but
 * @cb->start always equals the start file offset of the full decompressed
 * extent.
 *
 * When reading a compressed extent, we have to read the full compressed
 * extent, while @orig_bio may only want part of the range.
 * Thus this function will ensure only data covered by @orig_bio will be
 * copied.
 *
 * Return 0 if we have copied all needed contents for @orig_bio.
 * Return >0 if we need to continue decompressing.
 */
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
			      struct compressed_bio *cb, u32 decompressed)
{
	struct bio *orig_bio = &cb->orig_bbio->bio;
	/* Offset inside the full decompressed extent */
	u32 cur_offset;

	cur_offset = decompressed;
	/* The main loop to do the copy */
	while (cur_offset < decompressed + buf_len) {
		struct bio_vec bvec;
		size_t copy_len;
		u32 copy_start;
		/* Offset inside the full decompressed extent */
		u32 bvec_offset;
		void *kaddr;

		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
		/*
		 * cb->start may underflow, but subtracting that value can still
		 * give us correct offset inside the full decompressed extent.
		 */
		bvec_offset = file_offset_from_bvec(&bvec) - cb->start;

		/* Haven't reached the bvec range, exit */
		if (decompressed + buf_len <= bvec_offset)
			return 1;

		copy_start = max(cur_offset, bvec_offset);
		copy_len = min(bvec_offset + bvec.bv_len,
			       decompressed + buf_len) - copy_start;
		ASSERT(copy_len);

		/*
		 * Extra range check to ensure we didn't go beyond
		 * @buf + @buf_len.
		 */
		ASSERT(copy_start - decompressed < buf_len);

		kaddr = bvec_kmap_local(&bvec);
		memcpy(kaddr, buf + copy_start - decompressed, copy_len);
		kunmap_local(kaddr);

		cur_offset += copy_len;
		bio_advance(orig_bio, copy_len);
		/* Finished the bio */
		if (!orig_bio->bi_iter.bi_size)
			return 0;
	}
	return 1;
}

/*
 * Shannon Entropy calculation.
 *
 * Pure byte distribution analysis fails to determine compressibility of data.
 * Try calculating entropy to estimate the average minimum number of bits
 * needed to encode the sampled data.
 *
 * For convenience, return the percentage of needed bits, instead of amount of
 * bits directly.
 *
 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
 *                          and can be compressible with high probability
 *
 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
 *
 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
 */
#define ENTROPY_LVL_ACEPTABLE (65)
#define ENTROPY_LVL_HIGH (80)

/*
 * For increased precision in shannon_entropy calculation,
 * let's do pow(n, M) to save more digits after the decimal point:
 *
 * - maximum int bit length is 64
 * - ilog2(MAX_SAMPLE_SIZE) -> 13
 * - 13 * 4 = 52 < 64 -> M = 4
 *
 * So use pow(n, 4).
 */
static inline u32 ilog2_w(u64 n)
{
	return ilog2(n * n * n * n);
}

static u32 shannon_entropy(struct heuristic_ws *ws)
{
	const u32 entropy_max = 8 * ilog2_w(2);
	u32 entropy_sum = 0;
	u32 p, p_base, sz_base;
	u32 i;

	sz_base = ilog2_w(ws->sample_size);
	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
		p = ws->bucket[i].count;
		p_base = ilog2_w(p);
		entropy_sum += p * (sz_base - p_base);
	}

	entropy_sum /= ws->sample_size;
	return entropy_sum * 100 / entropy_max;
}
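
/*
 * shannon_entropy() above is a fixed-point approximation of
 * H = -sum(p_i * log2(p_i)) with p_i = count_i / sample_size: ilog2_w() acts
 * as log2 scaled by a factor of 4, and entropy_max (8 * ilog2_w(2) == 32) is
 * 8 bits in the same scale, so the result is the percentage of the 8 bits per
 * byte that would be needed to encode the sample.
 */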

#define RADIX_BASE 4U
#define COUNTERS_SIZE (1U << RADIX_BASE)

static u8 get4bits(u64 num, int shift)
{
	u8 low4bits;

	num >>= shift;
	/* Reverse order */
	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
	return low4bits;
}
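
/*
 * Because get4bits() maps each 4-bit digit to its complement, the counting
 * sort passes in radix_sort() below place larger counts first, i.e. the
 * buckets end up sorted in descending order (see the "Sort in reverse order"
 * comment in byte_core_set_size()).
 */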

/*
 * Use 4 bits as radix base
 * Use 16 u32 counters for calculating new position in buf array
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 *              must be equal in size to @array
 * @num       - array size
 */
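/*
 * Each iteration of the while loop below performs two counting-sort passes
 * (two RADIX_BASE-bit digits): the first pass scatters @array into @array_buf,
 * the second scatters @array_buf back into @array, so the sorted result always
 * ends up in @array without an extra memcpy().
 */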
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
		       int num)
{
	u64 max_num;
	u64 buf_num;
	u32 counters[COUNTERS_SIZE];
	u32 new_addr;
	u32 addr;
	int bitlen;
	int shift;
	int i;

	/*
	 * Try to avoid useless loop iterations for small numbers stored in big
	 * counters. Example: 48 33 4 ... in 64bit array
	 */
	max_num = array[0].count;
	for (i = 1; i < num; i++) {
		buf_num = array[i].count;
		if (buf_num > max_num)
			max_num = buf_num;
	}

	buf_num = ilog2(max_num);
	bitlen = ALIGN(buf_num, RADIX_BASE * 2);

	shift = 0;
	while (shift < bitlen) {
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array_buf[new_addr] = array[i];
		}

		shift += RADIX_BASE;

		/*
		 * A normal radix sort would move the data from the temporary
		 * array back to the main one, but that costs some CPU time.
		 * Avoid it by doing another sort iteration into the original
		 * array instead of a memcpy().
		 */
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array[new_addr] = array_buf[i];
		}

		shift += RADIX_BASE;
	}
}

/*
 * Size of the core byte set - how many bytes cover 90% of the sample.
 *
 * There are several types of structured binary data that use nearly all byte
 * values. The distribution can be uniform and counts in all buckets will be
 * nearly the same (eg. encrypted data). Unlikely to be compressible.
 *
 * Another possibility is a normal (Gaussian) distribution, where the data
 * could be potentially compressible, but we have to take a few more steps to
 * decide how much.
 *
 * @BYTE_CORE_SET_LOW  - the bulk of the data consists of a few frequently
 *                       repeated byte values, a compression algorithm can
 *                       easily handle that
 * @BYTE_CORE_SET_HIGH - the data has a uniform distribution and with high
 *                       probability is not compressible
 */
#define BYTE_CORE_SET_LOW (64)
#define BYTE_CORE_SET_HIGH (200)

static int byte_core_set_size(struct heuristic_ws *ws)
{
	u32 i;
	u32 coreset_sum = 0;
	const u32 core_set_threshold = ws->sample_size * 90 / 100;
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;

	if (coreset_sum > core_set_threshold)
		return i;

	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
		coreset_sum += bucket[i].count;
		if (coreset_sum > core_set_threshold)
			break;
	}

	return i;
}

/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * In most text-like data the byte set is restricted to a limited number of
 * possible characters, and that restriction in most cases makes the data easy
 * to compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */
#define BYTE_SET_THRESHOLD (64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting the count of byte values in buckets. If the byte
	 * set size is bigger than the threshold, it's pointless to continue,
	 * the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
	const u8 *data = ws->sample;

	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}

static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
				     struct heuristic_ws *ws)
{
	struct page *page;
	pgoff_t index, index_end;
	u32 i, curr_sample_pos;
	u8 *in_data;

	/*
	 * Compression handles the input data by chunks of 128KiB
	 * (defined by BTRFS_MAX_UNCOMPRESSED).
	 *
	 * We do the same for the heuristic and loop over the whole range.
	 *
	 * MAX_SAMPLE_SIZE - calculated under the assumption that the heuristic
	 * will process no more than BTRFS_MAX_UNCOMPRESSED at a time.
	 */
	if (end - start > BTRFS_MAX_UNCOMPRESSED)
		end = start + BTRFS_MAX_UNCOMPRESSED;

	index = start >> PAGE_SHIFT;
	index_end = end >> PAGE_SHIFT;

	/* Don't miss unaligned end */
	if (!PAGE_ALIGNED(end))
		index_end++;

	curr_sample_pos = 0;
	while (index < index_end) {
		page = find_get_page(inode->i_mapping, index);
		in_data = kmap_local_page(page);
		/* Handle case where the start is not aligned to PAGE_SIZE */
		i = start % PAGE_SIZE;
		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
			/* Don't sample any garbage from the last page */
			if (start > end - SAMPLING_READ_SIZE)
				break;
			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
			       SAMPLING_READ_SIZE);
			i += SAMPLING_INTERVAL;
			start += SAMPLING_INTERVAL;
			curr_sample_pos += SAMPLING_READ_SIZE;
		}
		kunmap_local(in_data);
		put_page(page);

		index++;
	}

	ws->sample_size = curr_sample_pos;
}

/*
 * Compression heuristic.
 *
 * The following types of analysis can be performed:
 * - detect mostly zero data
 * - detect data with low "byte set" size (text, etc)
 * - detect data with low/high "core byte" set
 *
 * Return non-zero if the compression should be done, 0 otherwise.
 */
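/*
 * The non-zero return values map to the check that triggered them:
 * 1 - the second half of the sample repeats the first half
 * 2 - byte set size below BYTE_SET_THRESHOLD
 * 3 - core byte set not larger than BYTE_CORE_SET_LOW
 * 4 - entropy not above ENTROPY_LVL_ACEPTABLE
 * 5 - entropy below ENTROPY_LVL_HIGH
 */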
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct list_head *ws_list = get_workspace(0, 0);
	struct heuristic_ws *ws;
	u32 i;
	u8 byte;
	int ret = 0;

	ws = list_entry(ws_list, struct heuristic_ws, list);

	heuristic_collect_sample(&inode->vfs_inode, start, end, ws);

	if (sample_repeated_patterns(ws)) {
		ret = 1;
		goto out;
	}

	memset(ws->bucket, 0, sizeof(*ws->bucket) * BUCKET_SIZE);

	for (i = 0; i < ws->sample_size; i++) {
		byte = ws->sample[i];
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

	i = byte_core_set_size(ws);
	if (i <= BYTE_CORE_SET_LOW) {
		ret = 3;
		goto out;
	}

	if (i >= BYTE_CORE_SET_HIGH) {
		ret = 0;
		goto out;
	}

	i = shannon_entropy(ws);
	if (i <= ENTROPY_LVL_ACEPTABLE) {
		ret = 4;
		goto out;
	}

	/*
	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
	 * needed to give green light to compression.
	 *
	 * For now just assume that compression at that level is not worth the
	 * resources because:
	 *
	 * 1. it is possible to defrag the data later
	 *
	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
	 *    values, every bucket has a counter at level ~54. The heuristic
	 *    would be confused. This can happen when data have some internal
	 *    repeated patterns like "abbacbbc...". This can be detected by
	 *    analyzing pairs of bytes, which is too costly.
	 */
	if (i < ENTROPY_LVL_HIGH) {
		ret = 5;
		goto out;
	} else {
		ret = 0;
		goto out;
	}

out:
	put_workspace(0, ws_list);
	return ret;
}

/*
 * Convert the compression suffix (eg. after "zlib", starting with ":") to a
 * level; an unrecognized string sets the default level. Negative level
 * numbers are allowed.
 */
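/*
 * For example, with type zstd and str ":3" this returns 3 (clamped to the
 * valid range); an empty string or a suffix that does not parse as an integer
 * falls back to the default level of the given type.
 */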
int btrfs_compress_str2level(unsigned int type, const char *str)
{
	int level = 0;
	int ret;

	if (!type)
		return 0;

	if (str[0] == ':') {
		ret = kstrtoint(str + 1, 10, &level);
		if (ret)
			level = 0;
	}

	level = btrfs_compress_set_level(type, level);

	return level;
}