Path: blob/main/contrib/libdivsufsort/lib/divsufsort.c
39478 views
/*1* divsufsort.c for libdivsufsort2* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.3*4* Permission is hereby granted, free of charge, to any person5* obtaining a copy of this software and associated documentation6* files (the "Software"), to deal in the Software without7* restriction, including without limitation the rights to use,8* copy, modify, merge, publish, distribute, sublicense, and/or sell9* copies of the Software, and to permit persons to whom the10* Software is furnished to do so, subject to the following11* conditions:12*13* The above copyright notice and this permission notice shall be14* included in all copies or substantial portions of the Software.15*16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,17* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES18* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND19* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT20* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,21* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING22* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR23* OTHER DEALINGS IN THE SOFTWARE.24*/2526#include "divsufsort_private.h"27#ifdef _OPENMP28# include <omp.h>29#endif303132/*- Private Functions -*/3334/* Sorts suffixes of type B*. */35static36saidx_t37sort_typeBstar(const sauchar_t *T, saidx_t *SA,38saidx_t *bucket_A, saidx_t *bucket_B,39saidx_t n) {40saidx_t *PAb, *ISAb, *buf;41#ifdef _OPENMP42saidx_t *curbuf;43saidx_t l;44#endif45saidx_t i, j, k, t, m, bufsize;46saint_t c0, c1;47#ifdef _OPENMP48saint_t d0, d1;49int tmp;50#endif5152/* Initialize bucket arrays. */53for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }54for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }5556/* Count the number of occurrences of the first one or two characters of each57type A, B and B* suffix. Moreover, store the beginning position of all58type B* suffixes into the array SA. */59for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {60/* type A suffix. */61do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));62if(0 <= i) {63/* type B* suffix. */64++BUCKET_BSTAR(c0, c1);65SA[--m] = i;66/* type B suffix. */67for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {68++BUCKET_B(c0, c1);69}70}71}72m = n - m;73/*74note:75A type B* suffix is lexicographically smaller than a type B suffix that76begins with the same first two characters.77*/7879/* Calculate the index of start/end point of each bucket. */80for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {81t = i + BUCKET_A(c0);82BUCKET_A(c0) = i + j; /* start point */83i = t + BUCKET_B(c0, c0);84for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {85j += BUCKET_BSTAR(c0, c1);86BUCKET_BSTAR(c0, c1) = j; /* end point */87i += BUCKET_B(c0, c1);88}89}9091if(0 < m) {92/* Sort the type B* suffixes by their first two characters. */93PAb = SA + n - m; ISAb = SA + m;94for(i = m - 2; 0 <= i; --i) {95t = PAb[i], c0 = T[t], c1 = T[t + 1];96SA[--BUCKET_BSTAR(c0, c1)] = i;97}98t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];99SA[--BUCKET_BSTAR(c0, c1)] = m - 1;100101/* Sort the type B* substrings using sssort. */102#ifdef _OPENMP103tmp = omp_get_max_threads();104buf = SA + m, bufsize = (n - (2 * m)) / tmp;105c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;106#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)107{108tmp = omp_get_thread_num();109curbuf = buf + tmp * bufsize;110k = 0;111for(;;) {112#pragma omp critical(sssort_lock)113{114if(0 < (l = j)) {115d0 = c0, d1 = c1;116do {117k = BUCKET_BSTAR(d0, d1);118if(--d1 <= d0) {119d1 = ALPHABET_SIZE - 1;120if(--d0 < 0) { break; }121}122} while(((l - k) <= 1) && (0 < (l = k)));123c0 = d0, c1 = d1, j = k;124}125}126if(l == 0) { break; }127sssort(T, PAb, SA + k, SA + l,128curbuf, bufsize, 2, n, *(SA + k) == (m - 1));129}130}131#else132buf = SA + m, bufsize = n - (2 * m);133for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {134for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {135i = BUCKET_BSTAR(c0, c1);136if(1 < (j - i)) {137sssort(T, PAb, SA + i, SA + j,138buf, bufsize, 2, n, *(SA + i) == (m - 1));139}140}141}142#endif143144/* Compute ranks of type B* substrings. */145for(i = m - 1; 0 <= i; --i) {146if(0 <= SA[i]) {147j = i;148do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));149SA[i + 1] = i - j;150if(i <= 0) { break; }151}152j = i;153do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);154ISAb[SA[i]] = j;155}156157/* Construct the inverse suffix array of type B* suffixes using trsort. */158trsort(ISAb, SA, m, 1);159160/* Set the sorted order of tyoe B* suffixes. */161for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {162for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }163if(0 <= i) {164t = i;165for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }166SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;167}168}169170/* Calculate the index of start/end point of each bucket. */171BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */172for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {173i = BUCKET_A(c0 + 1) - 1;174for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {175t = i - BUCKET_B(c0, c1);176BUCKET_B(c0, c1) = i; /* end point */177178/* Move all type B* suffixes to the correct position. */179for(i = t, j = BUCKET_BSTAR(c0, c1);180j <= k;181--i, --k) { SA[i] = SA[k]; }182}183BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */184BUCKET_B(c0, c0) = i; /* end point */185}186}187188return m;189}190191/* Constructs the suffix array by using the sorted order of type B* suffixes. */192static193void194construct_SA(const sauchar_t *T, saidx_t *SA,195saidx_t *bucket_A, saidx_t *bucket_B,196saidx_t n, saidx_t m) {197saidx_t *i, *j, *k;198saidx_t s;199saint_t c0, c1, c2;200201if(0 < m) {202/* Construct the sorted order of type B suffixes by using203the sorted order of type B* suffixes. */204for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {205/* Scan the suffix array from right to left. */206for(i = SA + BUCKET_BSTAR(c1, c1 + 1),207j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;208i <= j;209--j) {210if(0 < (s = *j)) {211assert(T[s] == c1);212assert(((s + 1) < n) && (T[s] <= T[s + 1]));213assert(T[s - 1] <= T[s]);214*j = ~s;215c0 = T[--s];216if((0 < s) && (T[s - 1] > c0)) { s = ~s; }217if(c0 != c2) {218if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }219k = SA + BUCKET_B(c2 = c0, c1);220}221assert(k < j);222*k-- = s;223} else {224assert(((s == 0) && (T[s] == c1)) || (s < 0));225*j = ~s;226}227}228}229}230231/* Construct the suffix array by using232the sorted order of type B suffixes. */233k = SA + BUCKET_A(c2 = T[n - 1]);234*k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);235/* Scan the suffix array from left to right. */236for(i = SA, j = SA + n; i < j; ++i) {237if(0 < (s = *i)) {238assert(T[s - 1] >= T[s]);239c0 = T[--s];240if((s == 0) || (T[s - 1] < c0)) { s = ~s; }241if(c0 != c2) {242BUCKET_A(c2) = k - SA;243k = SA + BUCKET_A(c2 = c0);244}245assert(i < k);246*k++ = s;247} else {248assert(s < 0);249*i = ~s;250}251}252}253254/* Constructs the burrows-wheeler transformed string directly255by using the sorted order of type B* suffixes. */256static257saidx_t258construct_BWT(const sauchar_t *T, saidx_t *SA,259saidx_t *bucket_A, saidx_t *bucket_B,260saidx_t n, saidx_t m) {261saidx_t *i, *j, *k, *orig;262saidx_t s;263saint_t c0, c1, c2;264265if(0 < m) {266/* Construct the sorted order of type B suffixes by using267the sorted order of type B* suffixes. */268for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {269/* Scan the suffix array from right to left. */270for(i = SA + BUCKET_BSTAR(c1, c1 + 1),271j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;272i <= j;273--j) {274if(0 < (s = *j)) {275assert(T[s] == c1);276assert(((s + 1) < n) && (T[s] <= T[s + 1]));277assert(T[s - 1] <= T[s]);278c0 = T[--s];279*j = ~((saidx_t)c0);280if((0 < s) && (T[s - 1] > c0)) { s = ~s; }281if(c0 != c2) {282if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }283k = SA + BUCKET_B(c2 = c0, c1);284}285assert(k < j);286*k-- = s;287} else if(s != 0) {288*j = ~s;289#ifndef NDEBUG290} else {291assert(T[s] == c1);292#endif293}294}295}296}297298/* Construct the BWTed string by using299the sorted order of type B suffixes. */300k = SA + BUCKET_A(c2 = T[n - 1]);301*k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);302/* Scan the suffix array from left to right. */303for(i = SA, j = SA + n, orig = SA; i < j; ++i) {304if(0 < (s = *i)) {305assert(T[s - 1] >= T[s]);306c0 = T[--s];307*i = c0;308if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }309if(c0 != c2) {310BUCKET_A(c2) = k - SA;311k = SA + BUCKET_A(c2 = c0);312}313assert(i < k);314*k++ = s;315} else if(s != 0) {316*i = ~s;317} else {318orig = i;319}320}321322return orig - SA;323}324325326/*---------------------------------------------------------------------------*/327328/*- Function -*/329330saint_t331divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {332saidx_t *bucket_A, *bucket_B;333saidx_t m;334saint_t err = 0;335336/* Check arguments. */337if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }338else if(n == 0) { return 0; }339else if(n == 1) { SA[0] = 0; return 0; }340else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }341342bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));343bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));344345/* Suffixsort. */346if((bucket_A != NULL) && (bucket_B != NULL)) {347m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);348construct_SA(T, SA, bucket_A, bucket_B, n, m);349} else {350err = -2;351}352353free(bucket_B);354free(bucket_A);355356return err;357}358359saidx_t360divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {361saidx_t *B;362saidx_t *bucket_A, *bucket_B;363saidx_t m, pidx, i;364365/* Check arguments. */366if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }367else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }368369if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }370bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));371bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));372373/* Burrows-Wheeler Transform. */374if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {375m = sort_typeBstar(T, B, bucket_A, bucket_B, n);376pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);377378/* Copy to output string. */379U[0] = T[n - 1];380for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }381for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }382pidx += 1;383} else {384pidx = -2;385}386387free(bucket_B);388free(bucket_A);389if(A == NULL) { free(B); }390391return pidx;392}393394const char *395divsufsort_version(void) {396return PROJECT_VERSION_FULL;397}398399400