Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/libdivsufsort/lib/divsufsort.c
39478 views
1
/*
 * divsufsort.c for libdivsufsort
 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
26
27
#include "divsufsort_private.h"
28
#ifdef _OPENMP
29
# include <omp.h>
30
#endif
31
32
33
/*- Private Functions -*/
34
35
/* Sorts suffixes of type B*. */
36
static
37
saidx_t
38
sort_typeBstar(const sauchar_t *T, saidx_t *SA,
39
saidx_t *bucket_A, saidx_t *bucket_B,
40
saidx_t n) {
41
saidx_t *PAb, *ISAb, *buf;
42
#ifdef _OPENMP
43
saidx_t *curbuf;
44
saidx_t l;
45
#endif
46
saidx_t i, j, k, t, m, bufsize;
47
saint_t c0, c1;
48
#ifdef _OPENMP
49
saint_t d0, d1;
50
int tmp;
51
#endif
52
53
/* Initialize bucket arrays. */
54
for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
55
for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
56
57
/* Count the number of occurrences of the first one or two characters of each
58
type A, B and B* suffix. Moreover, store the beginning position of all
59
type B* suffixes into the array SA. */
60
for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
61
/* type A suffix. */
62
do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
63
if(0 <= i) {
64
/* type B* suffix. */
65
++BUCKET_BSTAR(c0, c1);
66
SA[--m] = i;
67
/* type B suffix. */
68
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
69
++BUCKET_B(c0, c1);
70
}
71
}
72
}
73
m = n - m;
74
/*
75
note:
76
A type B* suffix is lexicographically smaller than a type B suffix that
77
begins with the same first two characters.
78
*/
79
80
/* Calculate the index of start/end point of each bucket. */
81
for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
82
t = i + BUCKET_A(c0);
83
BUCKET_A(c0) = i + j; /* start point */
84
i = t + BUCKET_B(c0, c0);
85
for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
86
j += BUCKET_BSTAR(c0, c1);
87
BUCKET_BSTAR(c0, c1) = j; /* end point */
88
i += BUCKET_B(c0, c1);
89
}
90
}
91
92
if(0 < m) {
93
/* Sort the type B* suffixes by their first two characters. */
94
PAb = SA + n - m; ISAb = SA + m;
95
for(i = m - 2; 0 <= i; --i) {
96
t = PAb[i], c0 = T[t], c1 = T[t + 1];
97
SA[--BUCKET_BSTAR(c0, c1)] = i;
98
}
99
t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
100
SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
101
102
/* Sort the type B* substrings using sssort. */
103
#ifdef _OPENMP
104
tmp = omp_get_max_threads();
105
buf = SA + m, bufsize = (n - (2 * m)) / tmp;
106
c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
107
#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
108
{
109
tmp = omp_get_thread_num();
110
curbuf = buf + tmp * bufsize;
111
k = 0;
112
for(;;) {
113
#pragma omp critical(sssort_lock)
114
{
115
if(0 < (l = j)) {
116
d0 = c0, d1 = c1;
117
do {
118
k = BUCKET_BSTAR(d0, d1);
119
if(--d1 <= d0) {
120
d1 = ALPHABET_SIZE - 1;
121
if(--d0 < 0) { break; }
122
}
123
} while(((l - k) <= 1) && (0 < (l = k)));
124
c0 = d0, c1 = d1, j = k;
125
}
126
}
127
if(l == 0) { break; }
128
sssort(T, PAb, SA + k, SA + l,
129
curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
130
}
131
}
132
#else
133
buf = SA + m, bufsize = n - (2 * m);
134
for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
135
for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
136
i = BUCKET_BSTAR(c0, c1);
137
if(1 < (j - i)) {
138
sssort(T, PAb, SA + i, SA + j,
139
buf, bufsize, 2, n, *(SA + i) == (m - 1));
140
}
141
}
142
}
143
#endif
144
145
/* Compute ranks of type B* substrings. */
146
for(i = m - 1; 0 <= i; --i) {
147
if(0 <= SA[i]) {
148
j = i;
149
do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
150
SA[i + 1] = i - j;
151
if(i <= 0) { break; }
152
}
153
j = i;
154
do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
155
ISAb[SA[i]] = j;
156
}
157
158
/* Construct the inverse suffix array of type B* suffixes using trsort. */
159
trsort(ISAb, SA, m, 1);
160
161
/* Set the sorted order of tyoe B* suffixes. */
162
for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
163
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
164
if(0 <= i) {
165
t = i;
166
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
167
SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
168
}
169
}
170
171
/* Calculate the index of start/end point of each bucket. */
172
BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
173
for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
174
i = BUCKET_A(c0 + 1) - 1;
175
for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
176
t = i - BUCKET_B(c0, c1);
177
BUCKET_B(c0, c1) = i; /* end point */
178
179
/* Move all type B* suffixes to the correct position. */
180
for(i = t, j = BUCKET_BSTAR(c0, c1);
181
j <= k;
182
--i, --k) { SA[i] = SA[k]; }
183
}
184
BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
185
BUCKET_B(c0, c0) = i; /* end point */
186
}
187
}
188
189
return m;
190
}
191
192
/* Constructs the suffix array by using the sorted order of type B* suffixes. */
193
static
194
void
195
construct_SA(const sauchar_t *T, saidx_t *SA,
196
saidx_t *bucket_A, saidx_t *bucket_B,
197
saidx_t n, saidx_t m) {
198
saidx_t *i, *j, *k;
199
saidx_t s;
200
saint_t c0, c1, c2;
201
202
if(0 < m) {
203
/* Construct the sorted order of type B suffixes by using
204
the sorted order of type B* suffixes. */
205
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
206
/* Scan the suffix array from right to left. */
207
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
208
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
209
i <= j;
210
--j) {
211
if(0 < (s = *j)) {
212
assert(T[s] == c1);
213
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
214
assert(T[s - 1] <= T[s]);
215
*j = ~s;
216
c0 = T[--s];
217
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
218
if(c0 != c2) {
219
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
220
k = SA + BUCKET_B(c2 = c0, c1);
221
}
222
assert(k < j);
223
*k-- = s;
224
} else {
225
assert(((s == 0) && (T[s] == c1)) || (s < 0));
226
*j = ~s;
227
}
228
}
229
}
230
}
231
232
/* Construct the suffix array by using
233
the sorted order of type B suffixes. */
234
k = SA + BUCKET_A(c2 = T[n - 1]);
235
*k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
236
/* Scan the suffix array from left to right. */
237
for(i = SA, j = SA + n; i < j; ++i) {
238
if(0 < (s = *i)) {
239
assert(T[s - 1] >= T[s]);
240
c0 = T[--s];
241
if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
242
if(c0 != c2) {
243
BUCKET_A(c2) = k - SA;
244
k = SA + BUCKET_A(c2 = c0);
245
}
246
assert(i < k);
247
*k++ = s;
248
} else {
249
assert(s < 0);
250
*i = ~s;
251
}
252
}
253
}
254
255
/* Constructs the burrows-wheeler transformed string directly
256
by using the sorted order of type B* suffixes. */
257
static
258
saidx_t
259
construct_BWT(const sauchar_t *T, saidx_t *SA,
260
saidx_t *bucket_A, saidx_t *bucket_B,
261
saidx_t n, saidx_t m) {
262
saidx_t *i, *j, *k, *orig;
263
saidx_t s;
264
saint_t c0, c1, c2;
265
266
if(0 < m) {
267
/* Construct the sorted order of type B suffixes by using
268
the sorted order of type B* suffixes. */
269
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
270
/* Scan the suffix array from right to left. */
271
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
272
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
273
i <= j;
274
--j) {
275
if(0 < (s = *j)) {
276
assert(T[s] == c1);
277
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
278
assert(T[s - 1] <= T[s]);
279
c0 = T[--s];
280
*j = ~((saidx_t)c0);
281
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
282
if(c0 != c2) {
283
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
284
k = SA + BUCKET_B(c2 = c0, c1);
285
}
286
assert(k < j);
287
*k-- = s;
288
} else if(s != 0) {
289
*j = ~s;
290
#ifndef NDEBUG
291
} else {
292
assert(T[s] == c1);
293
#endif
294
}
295
}
296
}
297
}
298
299
/* Construct the BWTed string by using
300
the sorted order of type B suffixes. */
301
k = SA + BUCKET_A(c2 = T[n - 1]);
302
*k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
303
/* Scan the suffix array from left to right. */
304
for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
305
if(0 < (s = *i)) {
306
assert(T[s - 1] >= T[s]);
307
c0 = T[--s];
308
*i = c0;
309
if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
310
if(c0 != c2) {
311
BUCKET_A(c2) = k - SA;
312
k = SA + BUCKET_A(c2 = c0);
313
}
314
assert(i < k);
315
*k++ = s;
316
} else if(s != 0) {
317
*i = ~s;
318
} else {
319
orig = i;
320
}
321
}
322
323
return orig - SA;
324
}
325
326
327
/*---------------------------------------------------------------------------*/
328
329
/*- Function -*/
330
331
saint_t
332
divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
333
saidx_t *bucket_A, *bucket_B;
334
saidx_t m;
335
saint_t err = 0;
336
337
/* Check arguments. */
338
if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
339
else if(n == 0) { return 0; }
340
else if(n == 1) { SA[0] = 0; return 0; }
341
else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
342
343
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
344
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
345
346
/* Suffixsort. */
347
if((bucket_A != NULL) && (bucket_B != NULL)) {
348
m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
349
construct_SA(T, SA, bucket_A, bucket_B, n, m);
350
} else {
351
err = -2;
352
}
353
354
free(bucket_B);
355
free(bucket_A);
356
357
return err;
358
}
359
360
saidx_t
361
divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
362
saidx_t *B;
363
saidx_t *bucket_A, *bucket_B;
364
saidx_t m, pidx, i;
365
366
/* Check arguments. */
367
if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
368
else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
369
370
if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
371
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
372
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
373
374
/* Burrows-Wheeler Transform. */
375
if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
376
m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
377
pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
378
379
/* Copy to output string. */
380
U[0] = T[n - 1];
381
for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
382
for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
383
pidx += 1;
384
} else {
385
pidx = -2;
386
}
387
388
free(bucket_B);
389
free(bucket_A);
390
if(A == NULL) { free(B); }
391
392
return pidx;
393
}
394
395
const char *
396
divsufsort_version(void) {
397
return PROJECT_VERSION_FULL;
398
}
399
400