Path: blob/master/thirdparty/icu4c/common/caniter.cpp
9903 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*****************************************************************************4* Copyright (C) 1996-2015, International Business Machines Corporation and5* others. All Rights Reserved.6*****************************************************************************7*/89#include "unicode/utypes.h"1011#if !UCONFIG_NO_NORMALIZATION1213#include "unicode/caniter.h"14#include "unicode/normalizer2.h"15#include "unicode/uchar.h"16#include "unicode/uniset.h"17#include "unicode/usetiter.h"18#include "unicode/ustring.h"19#include "unicode/utf16.h"20#include "cmemory.h"21#include "hash.h"22#include "normalizer2impl.h"2324/**25* This class allows one to iterate through all the strings that are canonically equivalent to a given26* string. For example, here are some sample results:27Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}281: \u0041\u030A\u0064\u0307\u032729= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}302: \u0041\u030A\u0064\u0327\u030731= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}323: \u0041\u030A\u1E0B\u032733= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}344: \u0041\u030A\u1E11\u030735= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}365: \u00C5\u0064\u0307\u032737= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}386: \u00C5\u0064\u0327\u030739= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}407: \u00C5\u1E0B\u032741= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}428: \u00C5\u1E11\u030743= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}449: \u212B\u0064\u0307\u032745= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}4610: \u212B\u0064\u0327\u030747= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}4811: \u212B\u1E0B\u032749= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}5012: \u212B\u1E11\u030751= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}52*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,53* since it has not been optimized for that situation.54*@author M. Davis55*@draft56*/5758// public5960U_NAMESPACE_BEGIN6162// TODO: add boilerplate methods.6364UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)656667/**68*@param source string to get results for69*/70CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :71pieces(nullptr),72pieces_length(0),73pieces_lengths(nullptr),74current(nullptr),75current_length(0),76nfd(Normalizer2::getNFDInstance(status)),77nfcImpl(Normalizer2Factory::getNFCImpl(status))78{79if(U_SUCCESS(status) && nfcImpl->ensureCanonIterData(status)) {80setSource(sourceStr, status);81}82}8384CanonicalIterator::~CanonicalIterator() {85cleanPieces();86}8788void CanonicalIterator::cleanPieces() {89int32_t i = 0;90if(pieces != nullptr) {91for(i = 0; i < pieces_length; i++) {92if(pieces[i] != nullptr) {93delete[] pieces[i];94}95}96uprv_free(pieces);97pieces = nullptr;98pieces_length = 0;99}100if(pieces_lengths != nullptr) {101uprv_free(pieces_lengths);102pieces_lengths = nullptr;103}104if(current != nullptr) {105uprv_free(current);106current = nullptr;107current_length = 0;108}109}110111/**112*@return gets the source: NOTE: it is the NFD form of source113*/114UnicodeString CanonicalIterator::getSource() {115return source;116}117118/**119* Resets the iterator so that one can start again from the beginning.120*/121void CanonicalIterator::reset() {122done = false;123for (int i = 0; i < current_length; ++i) {124current[i] = 0;125}126}127128/**129*@return the next string that is canonically equivalent. The value null is returned when130* the iteration is done.131*/132UnicodeString CanonicalIterator::next() {133int32_t i = 0;134135if (done) {136buffer.setToBogus();137return buffer;138}139140// delete old contents141buffer.remove();142143// construct return value144145for (i = 0; i < pieces_length; ++i) {146buffer.append(pieces[i][current[i]]);147}148//String result = buffer.toString(); // not needed149150// find next value for next time151152for (i = current_length - 1; ; --i) {153if (i < 0) {154done = true;155break;156}157current[i]++;158if (current[i] < pieces_lengths[i]) break; // got sequence159current[i] = 0;160}161return buffer;162}163164/**165*@param set the source string to iterate against. This allows the same iterator to be used166* while changing the source string, saving object creation.167*/168void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {169int32_t list_length = 0;170UChar32 cp = 0;171int32_t start = 0;172int32_t i = 0;173UnicodeString *list = nullptr;174175nfd->normalize(newSource, source, status);176if(U_FAILURE(status)) {177return;178}179done = false;180181cleanPieces();182183// catch degenerate case184if (newSource.length() == 0) {185pieces = static_cast<UnicodeString**>(uprv_malloc(sizeof(UnicodeString*)));186pieces_lengths = static_cast<int32_t*>(uprv_malloc(1 * sizeof(int32_t)));187pieces_length = 1;188current = static_cast<int32_t*>(uprv_malloc(1 * sizeof(int32_t)));189current_length = 1;190if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {191status = U_MEMORY_ALLOCATION_ERROR;192goto CleanPartialInitialization;193}194current[0] = 0;195pieces[0] = new UnicodeString[1];196pieces_lengths[0] = 1;197if (pieces[0] == nullptr) {198status = U_MEMORY_ALLOCATION_ERROR;199goto CleanPartialInitialization;200}201return;202}203204205list = new UnicodeString[source.length()];206if (list == nullptr) {207status = U_MEMORY_ALLOCATION_ERROR;208goto CleanPartialInitialization;209}210211// i should initially be the number of code units at the212// start of the string213i = U16_LENGTH(source.char32At(0));214// int32_t i = 1;215// find the segments216// This code iterates through the source string and217// extracts segments that end up on a codepoint that218// doesn't start any decompositions. (Analysis is done219// on the NFD form - see above).220for (; i < source.length(); i += U16_LENGTH(cp)) {221cp = source.char32At(i);222if (nfcImpl->isCanonSegmentStarter(cp)) {223source.extract(start, i-start, list[list_length++]); // add up to i224start = i;225}226}227source.extract(start, i-start, list[list_length++]); // add last one228229230// allocate the arrays, and find the strings that are CE to each segment231pieces = static_cast<UnicodeString**>(uprv_malloc(list_length * sizeof(UnicodeString*)));232pieces_length = list_length;233pieces_lengths = static_cast<int32_t*>(uprv_malloc(list_length * sizeof(int32_t)));234current = static_cast<int32_t*>(uprv_malloc(list_length * sizeof(int32_t)));235current_length = list_length;236if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {237status = U_MEMORY_ALLOCATION_ERROR;238goto CleanPartialInitialization;239}240241for (i = 0; i < current_length; i++) {242current[i] = 0;243}244// for each segment, get all the combinations that can produce245// it after NFD normalization246for (i = 0; i < pieces_length; ++i) {247//if (PROGRESS) printf("SEGMENT\n");248pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);249}250251delete[] list;252return;253// Common section to cleanup all local variables and reset object variables.254CleanPartialInitialization:255delete[] list;256cleanPieces();257}258259/**260* Dumb recursive implementation of permutation.261* TODO: optimize262* @param source the string to find permutations for263* @return the results in a set.264*/265void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {266if(U_FAILURE(status)) {267return;268}269// To avoid infinity loop caused by permute, we limit the depth of recursive270// call to permute and return U_UNSUPPORTED_ERROR.271// We know in some unit test we need at least 4. Set to 8 just in case some272// unforseen use cases.273constexpr int32_t kPermuteDepthLimit = 8;274if (depth > kPermuteDepthLimit) {275status = U_UNSUPPORTED_ERROR;276return;277}278//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));279int32_t i = 0;280281// optimization:282// if zero or one character, just return a set with it283// we check for length < 2 to keep from counting code points all the time284if (source.length() <= 2 && source.countChar32() <= 1) {285UnicodeString *toPut = new UnicodeString(source);286/* test for nullptr */287if (toPut == nullptr) {288status = U_MEMORY_ALLOCATION_ERROR;289return;290}291result->put(source, toPut, status);292return;293}294295// otherwise iterate through the string, and recursively permute all the other characters296UChar32 cp;297Hashtable subpermute(status);298if(U_FAILURE(status)) {299return;300}301subpermute.setValueDeleter(uprv_deleteUObject);302303for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {304cp = source.char32At(i);305const UHashElement *ne = nullptr;306int32_t el = UHASH_FIRST;307UnicodeString subPermuteString = source;308309// optimization:310// if the character is canonical combining class zero,311// don't permute it312if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {313//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));314continue;315}316317subpermute.removeAll();318319// see what the permutations of the characters before and after this one are320//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));321permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);322/* Test for buffer overflows */323if(U_FAILURE(status)) {324return;325}326// The upper remove is destructive. The question is do we have to make a copy, or we don't care about the contents327// of source at this point.328329// prefix this character to all of them330ne = subpermute.nextElement(el);331while (ne != nullptr) {332UnicodeString* permRes = static_cast<UnicodeString*>(ne->value.pointer);333UnicodeString *chStr = new UnicodeString(cp);334//test for nullptr335if (chStr == nullptr) {336status = U_MEMORY_ALLOCATION_ERROR;337return;338}339chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));340//if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));341result->put(*chStr, chStr, status);342ne = subpermute.nextElement(el);343}344}345//return result;346}347348// privates349350// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.351UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {352Hashtable result(status);353Hashtable permutations(status);354Hashtable basic(status);355if (U_FAILURE(status)) {356return nullptr;357}358result.setValueDeleter(uprv_deleteUObject);359permutations.setValueDeleter(uprv_deleteUObject);360basic.setValueDeleter(uprv_deleteUObject);361362char16_t USeg[256];363int32_t segLen = segment.extract(USeg, 256, status);364getEquivalents2(&basic, USeg, segLen, status);365if (U_FAILURE(status)) {366return nullptr;367}368369// now get all the permutations370// add only the ones that are canonically equivalent371// TODO: optimize by not permuting any class zero.372373const UHashElement *ne = nullptr;374int32_t el = UHASH_FIRST;375//Iterator it = basic.iterator();376ne = basic.nextElement(el);377//while (it.hasNext())378while (ne != nullptr) {379//String item = (String) it.next();380UnicodeString item = *static_cast<UnicodeString*>(ne->value.pointer);381382permutations.removeAll();383permute(item, CANITER_SKIP_ZEROES, &permutations, status);384const UHashElement *ne2 = nullptr;385int32_t el2 = UHASH_FIRST;386//Iterator it2 = permutations.iterator();387ne2 = permutations.nextElement(el2);388//while (it2.hasNext())389while (ne2 != nullptr) {390//String possible = (String) it2.next();391//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));392UnicodeString possible(*static_cast<UnicodeString*>(ne2->value.pointer));393UnicodeString attempt;394nfd->normalize(possible, attempt, status);395396// TODO: check if operator == is semanticaly the same as attempt.equals(segment)397if (attempt==segment) {398//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));399// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).400result.put(possible, new UnicodeString(possible), status); //add(possible);401} else {402//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));403}404405ne2 = permutations.nextElement(el2);406}407ne = basic.nextElement(el);408}409410/* Test for buffer overflows */411if(U_FAILURE(status)) {412return nullptr;413}414// convert into a String[] to clean up storage415//String[] finalResult = new String[result.size()];416UnicodeString *finalResult = nullptr;417int32_t resultCount;418if((resultCount = result.count()) != 0) {419finalResult = new UnicodeString[resultCount];420if (finalResult == nullptr) {421status = U_MEMORY_ALLOCATION_ERROR;422return nullptr;423}424}425else {426status = U_ILLEGAL_ARGUMENT_ERROR;427return nullptr;428}429//result.toArray(finalResult);430result_len = 0;431el = UHASH_FIRST;432ne = result.nextElement(el);433while(ne != nullptr) {434finalResult[result_len++] = *static_cast<UnicodeString*>(ne->value.pointer);435ne = result.nextElement(el);436}437438439return finalResult;440}441442Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const char16_t *segment, int32_t segLen, UErrorCode &status) {443444if (U_FAILURE(status)) {445return nullptr;446}447448//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));449450UnicodeString toPut(segment, segLen);451452fillinResult->put(toPut, new UnicodeString(toPut), status);453454UnicodeSet starts;455456// cycle through all the characters457UChar32 cp;458for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {459// see if any character is at the start of some decomposition460U16_GET(segment, 0, i, segLen, cp);461if (!nfcImpl->getCanonStartSet(cp, starts)) {462continue;463}464// if so, see which decompositions match465UnicodeSetIterator iter(starts);466while (iter.next()) {467UChar32 cp2 = iter.getCodepoint();468Hashtable remainder(status);469remainder.setValueDeleter(uprv_deleteUObject);470if (extract(&remainder, cp2, segment, segLen, i, status) == nullptr) {471if (U_FAILURE(status)) {472return nullptr;473}474continue;475}476477// there were some matches, so add all the possibilities to the set.478UnicodeString prefix(segment, i);479prefix += cp2;480481int32_t el = UHASH_FIRST;482const UHashElement *ne = remainder.nextElement(el);483while (ne != nullptr) {484UnicodeString item = *static_cast<UnicodeString*>(ne->value.pointer);485UnicodeString *toAdd = new UnicodeString(prefix);486/* test for nullptr */487if (toAdd == nullptr) {488status = U_MEMORY_ALLOCATION_ERROR;489return nullptr;490}491*toAdd += item;492fillinResult->put(*toAdd, toAdd, status);493494//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));495496ne = remainder.nextElement(el);497}498// ICU-22642 Guards against strings that have so many permutations499// that they would otherwise hang the function.500constexpr int32_t kResultLimit = 4096;501if (fillinResult->count() > kResultLimit) {502status = U_UNSUPPORTED_ERROR;503return nullptr;504}505}506}507508/* Test for buffer overflows */509if(U_FAILURE(status)) {510return nullptr;511}512return fillinResult;513}514515/**516* See if the decomposition of cp2 is at segment starting at segmentPos517* (with canonical rearrangement!)518* If so, take the remainder, and return the equivalents519*/520Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const char16_t *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {521//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {522//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));523//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);524525if (U_FAILURE(status)) {526return nullptr;527}528529UnicodeString temp(comp);530int32_t inputLen=temp.length();531UnicodeString decompString;532nfd->normalize(temp, decompString, status);533if (U_FAILURE(status)) {534return nullptr;535}536if (decompString.isBogus()) {537status = U_MEMORY_ALLOCATION_ERROR;538return nullptr;539}540const char16_t *decomp=decompString.getBuffer();541int32_t decompLen=decompString.length();542543// See if it matches the start of segment (at segmentPos)544UBool ok = false;545UChar32 cp;546int32_t decompPos = 0;547UChar32 decompCp;548U16_NEXT(decomp, decompPos, decompLen, decompCp);549550int32_t i = segmentPos;551while(i < segLen) {552U16_NEXT(segment, i, segLen, cp);553554if (cp == decompCp) { // if equal, eat another cp from decomp555556//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));557558if (decompPos == decompLen) { // done, have all decomp characters!559temp.append(segment+i, segLen-i);560ok = true;561break;562}563U16_NEXT(decomp, decompPos, decompLen, decompCp);564} else {565//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));566567// brute force approach568temp.append(cp);569570/* TODO: optimize571// since we know that the classes are monotonically increasing, after zero572// e.g. 0 5 7 9 0 3573// we can do an optimization574// there are only a few cases that work: zero, less, same, greater575// if both classes are the same, we fail576// if the decomp class < the segment class, we fail577578segClass = getClass(cp);579if (decompClass <= segClass) return null;580*/581}582}583if (!ok)584return nullptr; // we failed, characters left over585586//if (PROGRESS) printf("Matches\n");587588if (inputLen == temp.length()) {589fillinResult->put(UnicodeString(), new UnicodeString(), status);590return fillinResult; // succeed, but no remainder591}592593// brute force approach594// check to make sure result is canonically equivalent595UnicodeString trial;596nfd->normalize(temp, trial, status);597if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {598return nullptr;599}600601return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);602}603604U_NAMESPACE_END605606#endif /* #if !UCONFIG_NO_NORMALIZATION */607608609