Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmliblzma/liblzma/lz/lz_encoder_mf.c
3156 views
1
// SPDX-License-Identifier: 0BSD
2
3
///////////////////////////////////////////////////////////////////////////////
4
//
5
/// \file lz_encoder_mf.c
6
/// \brief Match finders
7
///
8
// Authors: Igor Pavlov
9
// Lasse Collin
10
//
11
///////////////////////////////////////////////////////////////////////////////
12
13
#include "lz_encoder.h"
14
#include "lz_encoder_hash.h"
15
#include "memcmplen.h"
16
17
18
/// \brief Find matches starting from the current byte
19
///
20
/// \return The length of the longest match found
21
extern uint32_t
22
lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
23
{
24
// Call the match finder. It returns the number of length-distance
25
// pairs found.
26
// FIXME: Minimum count is zero, what _exactly_ is the maximum?
27
const uint32_t count = mf->find(mf, matches);
28
29
// Length of the longest match; assume that no matches were found
30
// and thus the maximum length is zero.
31
uint32_t len_best = 0;
32
33
if (count > 0) {
34
#ifndef NDEBUG
35
// Validate the matches.
36
for (uint32_t i = 0; i < count; ++i) {
37
assert(matches[i].len <= mf->nice_len);
38
assert(matches[i].dist < mf->read_pos);
39
assert(memcmp(mf_ptr(mf) - 1,
40
mf_ptr(mf) - matches[i].dist - 2,
41
matches[i].len) == 0);
42
}
43
#endif
44
45
// The last used element in the array contains
46
// the longest match.
47
len_best = matches[count - 1].len;
48
49
// If a match of maximum search length was found, try to
50
// extend the match to maximum possible length.
51
if (len_best == mf->nice_len) {
52
// The limit for the match length is either the
53
// maximum match length supported by the LZ-based
54
// encoder or the number of bytes left in the
55
// dictionary, whichever is smaller.
56
uint32_t limit = mf_avail(mf) + 1;
57
if (limit > mf->match_len_max)
58
limit = mf->match_len_max;
59
60
// Pointer to the byte we just ran through
61
// the match finder.
62
const uint8_t *p1 = mf_ptr(mf) - 1;
63
64
// Pointer to the beginning of the match. We need -1
65
// here because the match distances are zero based.
66
const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
67
68
len_best = lzma_memcmplen(p1, p2, len_best, limit);
69
}
70
}
71
72
*count_ptr = count;
73
74
// Finally update the read position to indicate that match finder was
75
// run for this dictionary offset.
76
++mf->read_ahead;
77
78
return len_best;
79
}
80
81
82
/// Hash value to indicate unused element in the hash. Since we start the
83
/// positions from dict_size + 1, zero is always too far to qualify
84
/// as usable match position.
85
#define EMPTY_HASH_VALUE 0
86
87
88
/// Normalization must be done when lzma_mf.offset + lzma_mf.read_pos
89
/// reaches MUST_NORMALIZE_POS.
90
#define MUST_NORMALIZE_POS UINT32_MAX
91
92
93
/// \brief Normalizes hash values
94
///
95
/// The hash arrays store positions of match candidates. The positions are
96
/// relative to an arbitrary offset that is not the same as the absolute
97
/// offset in the input stream. The relative position of the current byte
98
/// is lzma_mf.offset + lzma_mf.read_pos. The distances of the matches are
99
/// the differences of the current read position and the position found from
100
/// the hash.
101
///
102
/// To prevent integer overflows of the offsets stored in the hash arrays,
103
/// we need to "normalize" the stored values now and then. During the
104
/// normalization, we drop values that indicate distance greater than the
105
/// dictionary size, thus making space for new values.
106
static void
107
normalize(lzma_mf *mf)
108
{
109
assert(mf->read_pos + mf->offset == MUST_NORMALIZE_POS);
110
111
// In future we may not want to touch the lowest bits, because there
112
// may be match finders that use larger resolution than one byte.
113
const uint32_t subvalue
114
= (MUST_NORMALIZE_POS - mf->cyclic_size);
115
// & ~((UINT32_C(1) << 10) - 1);
116
117
for (uint32_t i = 0; i < mf->hash_count; ++i) {
118
// If the distance is greater than the dictionary size,
119
// we can simply mark the hash element as empty.
120
if (mf->hash[i] <= subvalue)
121
mf->hash[i] = EMPTY_HASH_VALUE;
122
else
123
mf->hash[i] -= subvalue;
124
}
125
126
for (uint32_t i = 0; i < mf->sons_count; ++i) {
127
// Do the same for mf->son.
128
//
129
// NOTE: There may be uninitialized elements in mf->son.
130
// Valgrind may complain that the "if" below depends on
131
// an uninitialized value. In this case it is safe to ignore
132
// the warning. See also the comments in lz_encoder_init()
133
// in lz_encoder.c.
134
if (mf->son[i] <= subvalue)
135
mf->son[i] = EMPTY_HASH_VALUE;
136
else
137
mf->son[i] -= subvalue;
138
}
139
140
// Update offset to match the new locations.
141
mf->offset -= subvalue;
142
143
return;
144
}
145
146
147
/// Mark the current byte as processed from point of view of the match finder.
148
static void
149
move_pos(lzma_mf *mf)
150
{
151
if (++mf->cyclic_pos == mf->cyclic_size)
152
mf->cyclic_pos = 0;
153
154
++mf->read_pos;
155
assert(mf->read_pos <= mf->write_pos);
156
157
if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
158
normalize(mf);
159
}
160
161
162
/// When flushing, we cannot run the match finder unless there is nice_len
163
/// bytes available in the dictionary. Instead, we skip running the match
164
/// finder (indicating that no match was found), and count how many bytes we
165
/// have ignored this way.
166
///
167
/// When new data is given after the flushing was completed, the match finder
168
/// is restarted by rewinding mf->read_pos backwards by mf->pending. Then
169
/// the missed bytes are added to the hash using the match finder's skip
170
/// function (with small amount of input, it may start using mf->pending
171
/// again if flushing).
172
///
173
/// Due to this rewinding, we don't touch cyclic_pos or test for
174
/// normalization. It will be done when the match finder's skip function
175
/// catches up after a flush.
176
static void
177
move_pending(lzma_mf *mf)
178
{
179
++mf->read_pos;
180
assert(mf->read_pos <= mf->write_pos);
181
++mf->pending;
182
}
183
184
185
/// Calculate len_limit and determine if there is enough input to run
186
/// the actual match finder code. Sets up "cur" and "pos". This macro
187
/// is used by all find functions and binary tree skip functions. Hash
188
/// chain skip function doesn't need len_limit so a simpler code is used
189
/// in them.
190
#define header(is_bt, len_min, ret_op) \
191
uint32_t len_limit = mf_avail(mf); \
192
if (mf->nice_len <= len_limit) { \
193
len_limit = mf->nice_len; \
194
} else if (len_limit < (len_min) \
195
|| (is_bt && mf->action == LZMA_SYNC_FLUSH)) { \
196
assert(mf->action != LZMA_RUN); \
197
move_pending(mf); \
198
ret_op; \
199
} \
200
const uint8_t *cur = mf_ptr(mf); \
201
const uint32_t pos = mf->read_pos + mf->offset
202
203
204
/// Header for find functions. "return 0" indicates that zero matches
205
/// were found.
206
#define header_find(is_bt, len_min) \
207
header(is_bt, len_min, return 0); \
208
uint32_t matches_count = 0
209
210
211
/// Header for a loop in a skip function. "continue" tells to skip the rest
212
/// of the code in the loop.
213
#define header_skip(is_bt, len_min) \
214
header(is_bt, len_min, continue)
215
216
217
/// Calls hc_find_func() or bt_find_func() and calculates the total number
218
/// of matches found. Updates the dictionary position and returns the number
219
/// of matches found.
220
#define call_find(func, len_best) \
221
do { \
222
matches_count = (uint32_t)(func(len_limit, pos, cur, cur_match, \
223
mf->depth, mf->son, \
224
mf->cyclic_pos, mf->cyclic_size, \
225
matches + matches_count, len_best) \
226
- matches); \
227
move_pos(mf); \
228
return matches_count; \
229
} while (0)
230
231
232
////////////////
233
// Hash Chain //
234
////////////////
235
236
#if defined(HAVE_MF_HC3) || defined(HAVE_MF_HC4)
237
///
238
///
239
/// \param len_limit Don't look for matches longer than len_limit.
240
/// \param pos lzma_mf.read_pos + lzma_mf.offset
241
/// \param cur Pointer to current byte (mf_ptr(mf))
242
/// \param cur_match Start position of the current match candidate
243
/// \param depth Maximum length of the hash chain
244
/// \param son lzma_mf.son (contains the hash chain)
245
/// \param cyclic_pos lzma_mf.cyclic_pos
246
/// \param cyclic_size lzma_mf_cyclic_size
247
/// \param matches Array to hold the matches.
248
/// \param len_best The length of the longest match found so far.
249
static lzma_match *
250
hc_find_func(
251
const uint32_t len_limit,
252
const uint32_t pos,
253
const uint8_t *const cur,
254
uint32_t cur_match,
255
uint32_t depth,
256
uint32_t *const son,
257
const uint32_t cyclic_pos,
258
const uint32_t cyclic_size,
259
lzma_match *matches,
260
uint32_t len_best)
261
{
262
son[cyclic_pos] = cur_match;
263
264
while (true) {
265
const uint32_t delta = pos - cur_match;
266
if (depth-- == 0 || delta >= cyclic_size)
267
return matches;
268
269
const uint8_t *const pb = cur - delta;
270
cur_match = son[cyclic_pos - delta
271
+ (delta > cyclic_pos ? cyclic_size : 0)];
272
273
if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) {
274
uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit);
275
276
if (len_best < len) {
277
len_best = len;
278
matches->len = len;
279
matches->dist = delta - 1;
280
++matches;
281
282
if (len == len_limit)
283
return matches;
284
}
285
}
286
}
287
}
288
289
290
#define hc_find(len_best) \
291
call_find(hc_find_func, len_best)
292
293
294
#define hc_skip() \
295
do { \
296
mf->son[mf->cyclic_pos] = cur_match; \
297
move_pos(mf); \
298
} while (0)
299
300
#endif
301
302
303
#ifdef HAVE_MF_HC3
304
extern uint32_t
305
lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches)
306
{
307
header_find(false, 3);
308
309
hash_3_calc();
310
311
const uint32_t delta2 = pos - mf->hash[hash_2_value];
312
const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
313
314
mf->hash[hash_2_value] = pos;
315
mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
316
317
uint32_t len_best = 2;
318
319
if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
320
len_best = lzma_memcmplen(cur - delta2, cur,
321
len_best, len_limit);
322
323
matches[0].len = len_best;
324
matches[0].dist = delta2 - 1;
325
matches_count = 1;
326
327
if (len_best == len_limit) {
328
hc_skip();
329
return 1; // matches_count
330
}
331
}
332
333
hc_find(len_best);
334
}
335
336
337
extern void
338
lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount)
339
{
340
do {
341
if (mf_avail(mf) < 3) {
342
move_pending(mf);
343
continue;
344
}
345
346
const uint8_t *cur = mf_ptr(mf);
347
const uint32_t pos = mf->read_pos + mf->offset;
348
349
hash_3_calc();
350
351
const uint32_t cur_match
352
= mf->hash[FIX_3_HASH_SIZE + hash_value];
353
354
mf->hash[hash_2_value] = pos;
355
mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
356
357
hc_skip();
358
359
} while (--amount != 0);
360
}
361
#endif
362
363
364
#ifdef HAVE_MF_HC4
365
extern uint32_t
366
lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches)
367
{
368
header_find(false, 4);
369
370
hash_4_calc();
371
372
uint32_t delta2 = pos - mf->hash[hash_2_value];
373
const uint32_t delta3
374
= pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
375
const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
376
377
mf->hash[hash_2_value ] = pos;
378
mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
379
mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
380
381
uint32_t len_best = 1;
382
383
if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
384
len_best = 2;
385
matches[0].len = 2;
386
matches[0].dist = delta2 - 1;
387
matches_count = 1;
388
}
389
390
if (delta2 != delta3 && delta3 < mf->cyclic_size
391
&& *(cur - delta3) == *cur) {
392
len_best = 3;
393
matches[matches_count++].dist = delta3 - 1;
394
delta2 = delta3;
395
}
396
397
if (matches_count != 0) {
398
len_best = lzma_memcmplen(cur - delta2, cur,
399
len_best, len_limit);
400
401
matches[matches_count - 1].len = len_best;
402
403
if (len_best == len_limit) {
404
hc_skip();
405
return matches_count;
406
}
407
}
408
409
if (len_best < 3)
410
len_best = 3;
411
412
hc_find(len_best);
413
}
414
415
416
extern void
417
lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount)
418
{
419
do {
420
if (mf_avail(mf) < 4) {
421
move_pending(mf);
422
continue;
423
}
424
425
const uint8_t *cur = mf_ptr(mf);
426
const uint32_t pos = mf->read_pos + mf->offset;
427
428
hash_4_calc();
429
430
const uint32_t cur_match
431
= mf->hash[FIX_4_HASH_SIZE + hash_value];
432
433
mf->hash[hash_2_value] = pos;
434
mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
435
mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
436
437
hc_skip();
438
439
} while (--amount != 0);
440
}
441
#endif
442
443
444
/////////////////
445
// Binary Tree //
446
/////////////////
447
448
#if defined(HAVE_MF_BT2) || defined(HAVE_MF_BT3) || defined(HAVE_MF_BT4)
449
static lzma_match *
450
bt_find_func(
451
const uint32_t len_limit,
452
const uint32_t pos,
453
const uint8_t *const cur,
454
uint32_t cur_match,
455
uint32_t depth,
456
uint32_t *const son,
457
const uint32_t cyclic_pos,
458
const uint32_t cyclic_size,
459
lzma_match *matches,
460
uint32_t len_best)
461
{
462
uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
463
uint32_t *ptr1 = son + (cyclic_pos << 1);
464
465
uint32_t len0 = 0;
466
uint32_t len1 = 0;
467
468
while (true) {
469
const uint32_t delta = pos - cur_match;
470
if (depth-- == 0 || delta >= cyclic_size) {
471
*ptr0 = EMPTY_HASH_VALUE;
472
*ptr1 = EMPTY_HASH_VALUE;
473
return matches;
474
}
475
476
uint32_t *const pair = son + ((cyclic_pos - delta
477
+ (delta > cyclic_pos ? cyclic_size : 0))
478
<< 1);
479
480
const uint8_t *const pb = cur - delta;
481
uint32_t len = my_min(len0, len1);
482
483
if (pb[len] == cur[len]) {
484
len = lzma_memcmplen(pb, cur, len + 1, len_limit);
485
486
if (len_best < len) {
487
len_best = len;
488
matches->len = len;
489
matches->dist = delta - 1;
490
++matches;
491
492
if (len == len_limit) {
493
*ptr1 = pair[0];
494
*ptr0 = pair[1];
495
return matches;
496
}
497
}
498
}
499
500
if (pb[len] < cur[len]) {
501
*ptr1 = cur_match;
502
ptr1 = pair + 1;
503
cur_match = *ptr1;
504
len1 = len;
505
} else {
506
*ptr0 = cur_match;
507
ptr0 = pair;
508
cur_match = *ptr0;
509
len0 = len;
510
}
511
}
512
}
513
514
515
static void
516
bt_skip_func(
517
const uint32_t len_limit,
518
const uint32_t pos,
519
const uint8_t *const cur,
520
uint32_t cur_match,
521
uint32_t depth,
522
uint32_t *const son,
523
const uint32_t cyclic_pos,
524
const uint32_t cyclic_size)
525
{
526
uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
527
uint32_t *ptr1 = son + (cyclic_pos << 1);
528
529
uint32_t len0 = 0;
530
uint32_t len1 = 0;
531
532
while (true) {
533
const uint32_t delta = pos - cur_match;
534
if (depth-- == 0 || delta >= cyclic_size) {
535
*ptr0 = EMPTY_HASH_VALUE;
536
*ptr1 = EMPTY_HASH_VALUE;
537
return;
538
}
539
540
uint32_t *pair = son + ((cyclic_pos - delta
541
+ (delta > cyclic_pos ? cyclic_size : 0))
542
<< 1);
543
const uint8_t *pb = cur - delta;
544
uint32_t len = my_min(len0, len1);
545
546
if (pb[len] == cur[len]) {
547
len = lzma_memcmplen(pb, cur, len + 1, len_limit);
548
549
if (len == len_limit) {
550
*ptr1 = pair[0];
551
*ptr0 = pair[1];
552
return;
553
}
554
}
555
556
if (pb[len] < cur[len]) {
557
*ptr1 = cur_match;
558
ptr1 = pair + 1;
559
cur_match = *ptr1;
560
len1 = len;
561
} else {
562
*ptr0 = cur_match;
563
ptr0 = pair;
564
cur_match = *ptr0;
565
len0 = len;
566
}
567
}
568
}
569
570
571
#define bt_find(len_best) \
572
call_find(bt_find_func, len_best)
573
574
#define bt_skip() \
575
do { \
576
bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, \
577
mf->son, mf->cyclic_pos, \
578
mf->cyclic_size); \
579
move_pos(mf); \
580
} while (0)
581
582
#endif
583
584
585
#ifdef HAVE_MF_BT2
586
extern uint32_t
587
lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches)
588
{
589
header_find(true, 2);
590
591
hash_2_calc();
592
593
const uint32_t cur_match = mf->hash[hash_value];
594
mf->hash[hash_value] = pos;
595
596
bt_find(1);
597
}
598
599
600
extern void
601
lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount)
602
{
603
do {
604
header_skip(true, 2);
605
606
hash_2_calc();
607
608
const uint32_t cur_match = mf->hash[hash_value];
609
mf->hash[hash_value] = pos;
610
611
bt_skip();
612
613
} while (--amount != 0);
614
}
615
#endif
616
617
618
#ifdef HAVE_MF_BT3
619
extern uint32_t
620
lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches)
621
{
622
header_find(true, 3);
623
624
hash_3_calc();
625
626
const uint32_t delta2 = pos - mf->hash[hash_2_value];
627
const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
628
629
mf->hash[hash_2_value] = pos;
630
mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
631
632
uint32_t len_best = 2;
633
634
if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
635
len_best = lzma_memcmplen(
636
cur, cur - delta2, len_best, len_limit);
637
638
matches[0].len = len_best;
639
matches[0].dist = delta2 - 1;
640
matches_count = 1;
641
642
if (len_best == len_limit) {
643
bt_skip();
644
return 1; // matches_count
645
}
646
}
647
648
bt_find(len_best);
649
}
650
651
652
extern void
653
lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount)
654
{
655
do {
656
header_skip(true, 3);
657
658
hash_3_calc();
659
660
const uint32_t cur_match
661
= mf->hash[FIX_3_HASH_SIZE + hash_value];
662
663
mf->hash[hash_2_value] = pos;
664
mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
665
666
bt_skip();
667
668
} while (--amount != 0);
669
}
670
#endif
671
672
673
#ifdef HAVE_MF_BT4
674
extern uint32_t
675
lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches)
676
{
677
header_find(true, 4);
678
679
hash_4_calc();
680
681
uint32_t delta2 = pos - mf->hash[hash_2_value];
682
const uint32_t delta3
683
= pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
684
const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
685
686
mf->hash[hash_2_value] = pos;
687
mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
688
mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
689
690
uint32_t len_best = 1;
691
692
if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
693
len_best = 2;
694
matches[0].len = 2;
695
matches[0].dist = delta2 - 1;
696
matches_count = 1;
697
}
698
699
if (delta2 != delta3 && delta3 < mf->cyclic_size
700
&& *(cur - delta3) == *cur) {
701
len_best = 3;
702
matches[matches_count++].dist = delta3 - 1;
703
delta2 = delta3;
704
}
705
706
if (matches_count != 0) {
707
len_best = lzma_memcmplen(
708
cur, cur - delta2, len_best, len_limit);
709
710
matches[matches_count - 1].len = len_best;
711
712
if (len_best == len_limit) {
713
bt_skip();
714
return matches_count;
715
}
716
}
717
718
if (len_best < 3)
719
len_best = 3;
720
721
bt_find(len_best);
722
}
723
724
725
extern void
726
lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount)
727
{
728
do {
729
header_skip(true, 4);
730
731
hash_4_calc();
732
733
const uint32_t cur_match
734
= mf->hash[FIX_4_HASH_SIZE + hash_value];
735
736
mf->hash[hash_2_value] = pos;
737
mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
738
mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
739
740
bt_skip();
741
742
} while (--amount != 0);
743
}
744
#endif
745
746