// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*****************************************************************************4* Copyright (C) 1996-2015, International Business Machines Corporation and5* others. All Rights Reserved.6*****************************************************************************7*/89#include "unicode/utypes.h"1011#if !UCONFIG_NO_NORMALIZATION1213#include "unicode/caniter.h"14#include "unicode/normalizer2.h"15#include "unicode/uchar.h"16#include "unicode/uniset.h"17#include "unicode/usetiter.h"18#include "unicode/ustring.h"19#include "unicode/utf16.h"20#include "cmemory.h"21#include "hash.h"22#include "normalizer2impl.h"2324/**25* This class allows one to iterate through all the strings that are canonically equivalent to a given26* string. For example, here are some sample results:27Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}281: \u0041\u030A\u0064\u0307\u032729= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}302: \u0041\u030A\u0064\u0327\u030731= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}323: \u0041\u030A\u1E0B\u032733= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}344: \u0041\u030A\u1E11\u030735= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}365: \u00C5\u0064\u0307\u032737= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}386: \u00C5\u0064\u0327\u030739= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}407: \u00C5\u1E0B\u032741= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}428: \u00C5\u1E11\u030743= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}449: \u212B\u0064\u0307\u032745= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}4610: \u212B\u0064\u0327\u030747= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}4811: \u212B\u1E0B\u032749= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}5012: \u212B\u1E11\u030751= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}52*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,53* since it has not been optimized for that situation.54*@author M. Davis55*@draft56*/5758// public5960U_NAMESPACE_BEGIN6162// TODO: add boilerplate methods.6364UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)6566/**67*@param source string to get results for68*/69CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :70pieces(NULL),71pieces_length(0),72pieces_lengths(NULL),73current(NULL),74current_length(0),75nfd(*Normalizer2::getNFDInstance(status)),76nfcImpl(*Normalizer2Factory::getNFCImpl(status))77{78if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {79setSource(sourceStr, status);80}81}8283CanonicalIterator::~CanonicalIterator() {84cleanPieces();85}8687void CanonicalIterator::cleanPieces() {88int32_t i = 0;89if(pieces != NULL) {90for(i = 0; i < pieces_length; i++) {91if(pieces[i] != NULL) {92delete[] pieces[i];93}94}95uprv_free(pieces);96pieces = NULL;97pieces_length = 0;98}99if(pieces_lengths != NULL) {100uprv_free(pieces_lengths);101pieces_lengths = NULL;102}103if(current != NULL) {104uprv_free(current);105current = NULL;106current_length = 0;107}108}109110/**111*@return gets the source: NOTE: it is the NFD form of source112*/113UnicodeString CanonicalIterator::getSource() {114return source;115}116117/**118* Resets the iterator so that one can start again from the beginning.119*/120void CanonicalIterator::reset() {121done = false;122for (int i = 0; i < current_length; ++i) {123current[i] = 0;124}125}126127/**128*@return the next string that is canonically equivalent. The value null is returned when129* the iteration is done.130*/131UnicodeString CanonicalIterator::next() {132int32_t i = 0;133134if (done) {135buffer.setToBogus();136return buffer;137}138139// delete old contents140buffer.remove();141142// construct return value143144for (i = 0; i < pieces_length; ++i) {145buffer.append(pieces[i][current[i]]);146}147//String result = buffer.toString(); // not needed148149// find next value for next time150151for (i = current_length - 1; ; --i) {152if (i < 0) {153done = true;154break;155}156current[i]++;157if (current[i] < pieces_lengths[i]) break; // got sequence158current[i] = 0;159}160return buffer;161}162163/**164*@param set the source string to iterate against. This allows the same iterator to be used165* while changing the source string, saving object creation.166*/167void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {168int32_t list_length = 0;169UChar32 cp = 0;170int32_t start = 0;171int32_t i = 0;172UnicodeString *list = NULL;173174nfd.normalize(newSource, source, status);175if(U_FAILURE(status)) {176return;177}178done = false;179180cleanPieces();181182// catch degenerate case183if (newSource.length() == 0) {184pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));185pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));186pieces_length = 1;187current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));188current_length = 1;189if (pieces == NULL || pieces_lengths == NULL || current == NULL) {190status = U_MEMORY_ALLOCATION_ERROR;191goto CleanPartialInitialization;192}193current[0] = 0;194pieces[0] = new UnicodeString[1];195pieces_lengths[0] = 1;196if (pieces[0] == 0) {197status = U_MEMORY_ALLOCATION_ERROR;198goto CleanPartialInitialization;199}200return;201}202203204list = new UnicodeString[source.length()];205if (list == 0) {206status = U_MEMORY_ALLOCATION_ERROR;207goto CleanPartialInitialization;208}209210// i should initially be the number of code units at the211// start of the string212i = U16_LENGTH(source.char32At(0));213// int32_t i = 1;214// find the segments215// This code iterates through the source string and216// extracts segments that end up on a codepoint that217// doesn't start any decompositions. (Analysis is done218// on the NFD form - see above).219for (; i < source.length(); i += U16_LENGTH(cp)) {220cp = source.char32At(i);221if (nfcImpl.isCanonSegmentStarter(cp)) {222source.extract(start, i-start, list[list_length++]); // add up to i223start = i;224}225}226source.extract(start, i-start, list[list_length++]); // add last one227228229// allocate the arrays, and find the strings that are CE to each segment230pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));231pieces_length = list_length;232pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));233current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));234current_length = list_length;235if (pieces == NULL || pieces_lengths == NULL || current == NULL) {236status = U_MEMORY_ALLOCATION_ERROR;237goto CleanPartialInitialization;238}239240for (i = 0; i < current_length; i++) {241current[i] = 0;242}243// for each segment, get all the combinations that can produce244// it after NFD normalization245for (i = 0; i < pieces_length; ++i) {246//if (PROGRESS) printf("SEGMENT\n");247pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);248}249250delete[] list;251return;252// Common section to cleanup all local variables and reset object variables.253CleanPartialInitialization:254if (list != NULL) {255delete[] list;256}257cleanPieces();258}259260/**261* Dumb recursive implementation of permutation.262* TODO: optimize263* @param source the string to find permutations for264* @return the results in a set.265*/266void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {267if(U_FAILURE(status)) {268return;269}270//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));271int32_t i = 0;272273// optimization:274// if zero or one character, just return a set with it275// we check for length < 2 to keep from counting code points all the time276if (source.length() <= 2 && source.countChar32() <= 1) {277UnicodeString *toPut = new UnicodeString(source);278/* test for NULL */279if (toPut == 0) {280status = U_MEMORY_ALLOCATION_ERROR;281return;282}283result->put(source, toPut, status);284return;285}286287// otherwise iterate through the string, and recursively permute all the other characters288UChar32 cp;289Hashtable subpermute(status);290if(U_FAILURE(status)) {291return;292}293subpermute.setValueDeleter(uprv_deleteUObject);294295for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {296cp = source.char32At(i);297const UHashElement *ne = NULL;298int32_t el = UHASH_FIRST;299UnicodeString subPermuteString = source;300301// optimization:302// if the character is canonical combining class zero,303// don't permute it304if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {305//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));306continue;307}308309subpermute.removeAll();310311// see what the permutations of the characters before and after this one are312//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));313permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);314/* Test for buffer overflows */315if(U_FAILURE(status)) {316return;317}318// The upper remove is destructive. The question is do we have to make a copy, or we don't care about the contents319// of source at this point.320321// prefix this character to all of them322ne = subpermute.nextElement(el);323while (ne != NULL) {324UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);325UnicodeString *chStr = new UnicodeString(cp);326//test for NULL327if (chStr == NULL) {328status = U_MEMORY_ALLOCATION_ERROR;329return;330}331chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));332//if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));333result->put(*chStr, chStr, status);334ne = subpermute.nextElement(el);335}336}337//return result;338}339340// privates341342// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.343UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {344Hashtable result(status);345Hashtable permutations(status);346Hashtable basic(status);347if (U_FAILURE(status)) {348return 0;349}350result.setValueDeleter(uprv_deleteUObject);351permutations.setValueDeleter(uprv_deleteUObject);352basic.setValueDeleter(uprv_deleteUObject);353354UChar USeg[256];355int32_t segLen = segment.extract(USeg, 256, status);356getEquivalents2(&basic, USeg, segLen, status);357358// now get all the permutations359// add only the ones that are canonically equivalent360// TODO: optimize by not permuting any class zero.361362const UHashElement *ne = NULL;363int32_t el = UHASH_FIRST;364//Iterator it = basic.iterator();365ne = basic.nextElement(el);366//while (it.hasNext())367while (ne != NULL) {368//String item = (String) it.next();369UnicodeString item = *((UnicodeString *)(ne->value.pointer));370371permutations.removeAll();372permute(item, CANITER_SKIP_ZEROES, &permutations, status);373const UHashElement *ne2 = NULL;374int32_t el2 = UHASH_FIRST;375//Iterator it2 = permutations.iterator();376ne2 = permutations.nextElement(el2);377//while (it2.hasNext())378while (ne2 != NULL) {379//String possible = (String) it2.next();380//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));381UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));382UnicodeString attempt;383nfd.normalize(possible, attempt, status);384385// TODO: check if operator == is semanticaly the same as attempt.equals(segment)386if (attempt==segment) {387//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));388// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).389result.put(possible, new UnicodeString(possible), status); //add(possible);390} else {391//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));392}393394ne2 = permutations.nextElement(el2);395}396ne = basic.nextElement(el);397}398399/* Test for buffer overflows */400if(U_FAILURE(status)) {401return 0;402}403// convert into a String[] to clean up storage404//String[] finalResult = new String[result.size()];405UnicodeString *finalResult = NULL;406int32_t resultCount;407if((resultCount = result.count()) != 0) {408finalResult = new UnicodeString[resultCount];409if (finalResult == 0) {410status = U_MEMORY_ALLOCATION_ERROR;411return NULL;412}413}414else {415status = U_ILLEGAL_ARGUMENT_ERROR;416return NULL;417}418//result.toArray(finalResult);419result_len = 0;420el = UHASH_FIRST;421ne = result.nextElement(el);422while(ne != NULL) {423finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));424ne = result.nextElement(el);425}426427428return finalResult;429}430431Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {432433if (U_FAILURE(status)) {434return NULL;435}436437//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));438439UnicodeString toPut(segment, segLen);440441fillinResult->put(toPut, new UnicodeString(toPut), status);442443UnicodeSet starts;444445// cycle through all the characters446UChar32 cp;447for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {448// see if any character is at the start of some decomposition449U16_GET(segment, 0, i, segLen, cp);450if (!nfcImpl.getCanonStartSet(cp, starts)) {451continue;452}453// if so, see which decompositions match454UnicodeSetIterator iter(starts);455while (iter.next()) {456UChar32 cp2 = iter.getCodepoint();457Hashtable remainder(status);458remainder.setValueDeleter(uprv_deleteUObject);459if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {460continue;461}462463// there were some matches, so add all the possibilities to the set.464UnicodeString prefix(segment, i);465prefix += cp2;466467int32_t el = UHASH_FIRST;468const UHashElement *ne = remainder.nextElement(el);469while (ne != NULL) {470UnicodeString item = *((UnicodeString *)(ne->value.pointer));471UnicodeString *toAdd = new UnicodeString(prefix);472/* test for NULL */473if (toAdd == 0) {474status = U_MEMORY_ALLOCATION_ERROR;475return NULL;476}477*toAdd += item;478fillinResult->put(*toAdd, toAdd, status);479480//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));481482ne = remainder.nextElement(el);483}484}485}486487/* Test for buffer overflows */488if(U_FAILURE(status)) {489return NULL;490}491return fillinResult;492}493494/**495* See if the decomposition of cp2 is at segment starting at segmentPos496* (with canonical rearrangement!)497* If so, take the remainder, and return the equivalents498*/499Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {500//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {501//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));502//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);503504if (U_FAILURE(status)) {505return NULL;506}507508UnicodeString temp(comp);509int32_t inputLen=temp.length();510UnicodeString decompString;511nfd.normalize(temp, decompString, status);512if (U_FAILURE(status)) {513return NULL;514}515if (decompString.isBogus()) {516status = U_MEMORY_ALLOCATION_ERROR;517return NULL;518}519const UChar *decomp=decompString.getBuffer();520int32_t decompLen=decompString.length();521522// See if it matches the start of segment (at segmentPos)523UBool ok = false;524UChar32 cp;525int32_t decompPos = 0;526UChar32 decompCp;527U16_NEXT(decomp, decompPos, decompLen, decompCp);528529int32_t i = segmentPos;530while(i < segLen) {531U16_NEXT(segment, i, segLen, cp);532533if (cp == decompCp) { // if equal, eat another cp from decomp534535//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));536537if (decompPos == decompLen) { // done, have all decomp characters!538temp.append(segment+i, segLen-i);539ok = true;540break;541}542U16_NEXT(decomp, decompPos, decompLen, decompCp);543} else {544//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));545546// brute force approach547temp.append(cp);548549/* TODO: optimize550// since we know that the classes are monotonically increasing, after zero551// e.g. 0 5 7 9 0 3552// we can do an optimization553// there are only a few cases that work: zero, less, same, greater554// if both classes are the same, we fail555// if the decomp class < the segment class, we fail556557segClass = getClass(cp);558if (decompClass <= segClass) return null;559*/560}561}562if (!ok)563return NULL; // we failed, characters left over564565//if (PROGRESS) printf("Matches\n");566567if (inputLen == temp.length()) {568fillinResult->put(UnicodeString(), new UnicodeString(), status);569return fillinResult; // succeed, but no remainder570}571572// brute force approach573// check to make sure result is canonically equivalent574UnicodeString trial;575nfd.normalize(temp, trial, status);576if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {577return NULL;578}579580return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);581}582583U_NAMESPACE_END584585#endif /* #if !UCONFIG_NO_NORMALIZATION */586587588