// Path: blob/master/thirdparty/icu4c/common/brkiter.cpp
// 9903 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 1997-2015, International Business Machines Corporation and5* others. All Rights Reserved.6*******************************************************************************7*8* File brkiter.cpp9*10* Modification History:11*12* Date Name Description13* 02/18/97 aliu Converted from OpenClass. Added DONE.14* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.15*****************************************************************************************16*/1718// *****************************************************************************19// This file was generated from the java source file BreakIterator.java20// *****************************************************************************2122#include "unicode/utypes.h"2324#if !UCONFIG_NO_BREAK_ITERATION2526#include "unicode/rbbi.h"27#include "unicode/brkiter.h"28#include "unicode/udata.h"29#include "unicode/uloc.h"30#include "unicode/ures.h"31#include "unicode/ustring.h"32#include "unicode/filteredbrk.h"33#include "bytesinkutil.h"34#include "ucln_cmn.h"35#include "cstring.h"36#include "umutex.h"37#include "servloc.h"38#include "locbased.h"39#include "uresimp.h"40#include "uassert.h"41#include "ubrkimpl.h"42#include "utracimp.h"43#include "charstr.h"4445// *****************************************************************************46// class BreakIterator47// This class implements methods for finding the location of boundaries in text.48// Instances of BreakIterator maintain a current position and scan over text49// returning the index of characters where boundaries occur.50// *****************************************************************************5152U_NAMESPACE_BEGIN5354// -------------------------------------5556BreakIterator*57BreakIterator::buildInstance(const Locale& loc, const char 
*type, UErrorCode &status)58{59char fnbuff[256];60char ext[4]={'\0'};61CharString actual;62int32_t size;63const char16_t* brkfname = nullptr;64UResourceBundle brkRulesStack;65UResourceBundle brkNameStack;66UResourceBundle *brkRules = &brkRulesStack;67UResourceBundle *brkName = &brkNameStack;68RuleBasedBreakIterator *result = nullptr;6970if (U_FAILURE(status))71return nullptr;7273ures_initStackObject(brkRules);74ures_initStackObject(brkName);7576// Get the locale77UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);7879// Get the "boundaries" array.80if (U_SUCCESS(status)) {81brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);82// Get the string object naming the rules file83brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);84// Get the actual string85brkfname = ures_getString(brkName, &size, &status);86U_ASSERT((size_t)size<sizeof(fnbuff));87if (static_cast<size_t>(size) >= sizeof(fnbuff)) {88size=0;89if (U_SUCCESS(status)) {90status = U_BUFFER_OVERFLOW_ERROR;91}92}9394// Use the string if we found it95if (U_SUCCESS(status) && brkfname) {96actual.append(ures_getLocaleInternal(brkName, &status), -1, status);9798char16_t* extStart=u_strchr(brkfname, 0x002e);99int len = 0;100if (extStart != nullptr){101len = static_cast<int>(extStart - brkfname);102u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff103u_UCharsToChars(brkfname, fnbuff, len);104}105fnbuff[len]=0; // nul terminate106}107}108109ures_close(brkRules);110ures_close(brkName);111112UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);113if (U_FAILURE(status)) {114ures_close(b);115return nullptr;116}117118// Create a RuleBasedBreakIterator119result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);120121// If there is a result, set the valid locale and actual locale, and the kind122if (U_SUCCESS(status) && result != nullptr) {123U_LOCALE_BASED(locBased, 
*(BreakIterator*)result);124125locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),126actual.data(), status);127LocaleBased::setLocaleID(loc.getName(), result->requestLocale, status);128}129130ures_close(b);131132if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple133delete result;134return nullptr;135}136137if (result == nullptr) {138udata_close(file);139if (U_SUCCESS(status)) {140status = U_MEMORY_ALLOCATION_ERROR;141}142}143144return result;145}146147// Creates a break iterator for word breaks.148BreakIterator* U_EXPORT2149BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)150{151return createInstance(key, UBRK_WORD, status);152}153154// -------------------------------------155156// Creates a break iterator for line breaks.157BreakIterator* U_EXPORT2158BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)159{160return createInstance(key, UBRK_LINE, status);161}162163// -------------------------------------164165// Creates a break iterator for character breaks.166BreakIterator* U_EXPORT2167BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)168{169return createInstance(key, UBRK_CHARACTER, status);170}171172// -------------------------------------173174// Creates a break iterator for sentence breaks.175BreakIterator* U_EXPORT2176BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)177{178return createInstance(key, UBRK_SENTENCE, status);179}180181// -------------------------------------182183// Creates a break iterator for title casing breaks.184BreakIterator* U_EXPORT2185BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)186{187return createInstance(key, UBRK_TITLE, status);188}189190// -------------------------------------191192// Gets all the available locales that has localized text boundary data.193const Locale* U_EXPORT2194BreakIterator::getAvailableLocales(int32_t& count)195{196return 
Locale::getAvailableLocales(count);197}198199// ------------------------------------------200//201// Constructors, destructor and assignment operator202//203//-------------------------------------------204205BreakIterator::BreakIterator()206{207}208209BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {210UErrorCode status = U_ZERO_ERROR;211U_LOCALE_BASED(locBased, *this);212locBased.setLocaleIDs(other.validLocale, other.actualLocale, status);213LocaleBased::setLocaleID(other.requestLocale, requestLocale, status);214U_ASSERT(U_SUCCESS(status));215}216217BreakIterator &BreakIterator::operator =(const BreakIterator &other) {218if (this != &other) {219UErrorCode status = U_ZERO_ERROR;220U_LOCALE_BASED(locBased, *this);221locBased.setLocaleIDs(other.validLocale, other.actualLocale, status);222LocaleBased::setLocaleID(other.requestLocale, requestLocale, status);223U_ASSERT(U_SUCCESS(status));224}225return *this;226}227228BreakIterator::~BreakIterator()229{230delete validLocale;231delete actualLocale;232delete requestLocale;233}234235// ------------------------------------------236//237// Registration238//239//-------------------------------------------240#if !UCONFIG_NO_SERVICE241242// -------------------------------------243244class ICUBreakIteratorFactory : public ICUResourceBundleFactory {245public:246virtual ~ICUBreakIteratorFactory();247protected:248virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {249return BreakIterator::makeInstance(loc, kind, status);250}251};252253ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}254255// -------------------------------------256257class ICUBreakIteratorService : public ICULocaleService {258public:259ICUBreakIteratorService()260: ICULocaleService(UNICODE_STRING("Break Iterator", 14))261{262UErrorCode status = U_ZERO_ERROR;263registerFactory(new ICUBreakIteratorFactory(), status);264}265266virtual 
~ICUBreakIteratorService();267268virtual UObject* cloneInstance(UObject* instance) const override {269return ((BreakIterator*)instance)->clone();270}271272virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {273LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));274int32_t kind = lkey.kind();275Locale loc;276lkey.currentLocale(loc);277return BreakIterator::makeInstance(loc, kind, status);278}279280virtual UBool isDefault() const override {281return countFactories() == 1;282}283};284285ICUBreakIteratorService::~ICUBreakIteratorService() {}286287// -------------------------------------288289// defined in ucln_cmn.h290U_NAMESPACE_END291292static icu::UInitOnce gInitOnceBrkiter {};293static icu::ICULocaleService* gService = nullptr;294295296297/**298* Release all static memory held by breakiterator.299*/300U_CDECL_BEGIN301static UBool U_CALLCONV breakiterator_cleanup() {302#if !UCONFIG_NO_SERVICE303if (gService) {304delete gService;305gService = nullptr;306}307gInitOnceBrkiter.reset();308#endif309return true;310}311U_CDECL_END312U_NAMESPACE_BEGIN313314static void U_CALLCONV315initService() {316gService = new ICUBreakIteratorService();317ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);318}319320static ICULocaleService*321getService()322{323umtx_initOnce(gInitOnceBrkiter, &initService);324return gService;325}326327328// -------------------------------------329330static inline UBool331hasService()332{333return !gInitOnceBrkiter.isReset() && getService() != nullptr;334}335336// -------------------------------------337338URegistryKey U_EXPORT2339BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)340{341ICULocaleService *service = getService();342if (service == nullptr) {343status = U_MEMORY_ALLOCATION_ERROR;344return nullptr;345}346return service->registerInstance(toAdopt, locale, 
kind, status);347}348349// -------------------------------------350351UBool U_EXPORT2352BreakIterator::unregister(URegistryKey key, UErrorCode& status)353{354if (U_SUCCESS(status)) {355if (hasService()) {356return gService->unregister(key, status);357}358status = U_MEMORY_ALLOCATION_ERROR;359}360return false;361}362363// -------------------------------------364365StringEnumeration* U_EXPORT2366BreakIterator::getAvailableLocales()367{368ICULocaleService *service = getService();369if (service == nullptr) {370return nullptr;371}372return service->getAvailableLocales();373}374#endif /* UCONFIG_NO_SERVICE */375376// -------------------------------------377378BreakIterator*379BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)380{381if (U_FAILURE(status)) {382return nullptr;383}384385#if !UCONFIG_NO_SERVICE386if (hasService()) {387Locale actualLoc("");388BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);389// TODO: The way the service code works in ICU 2.8 is that if390// there is a real registered break iterator, the actualLoc391// will be populated, but if the handleDefault path is taken392// (because nothing is registered that can handle the393// requested locale) then the actualLoc comes back empty. In394// that case, the returned object already has its actual/valid395// locale data populated (by makeInstance, which is what396// handleDefault calls), so we don't touch it. 
YES, A COMMENT397// THIS LONG is a sign of bad code -- so the action item is to398// revisit this in ICU 3.0 and clean it up/fix it/remove it.399if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) {400U_LOCALE_BASED(locBased, *result);401locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName(), status);402}403return result;404}405else406#endif407{408return makeInstance(loc, kind, status);409}410}411412// -------------------------------------413enum { kKeyValueLenMax = 32 };414415BreakIterator*416BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)417{418419if (U_FAILURE(status)) {420return nullptr;421}422423BreakIterator *result = nullptr;424switch (kind) {425case UBRK_CHARACTER:426{427UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);428result = BreakIterator::buildInstance(loc, "grapheme", status);429UTRACE_EXIT_STATUS(status);430}431break;432case UBRK_WORD:433{434UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);435result = BreakIterator::buildInstance(loc, "word", status);436UTRACE_EXIT_STATUS(status);437}438break;439case UBRK_LINE:440{441char lb_lw[kKeyValueLenMax];442UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);443uprv_strcpy(lb_lw, "line");444UErrorCode kvStatus = U_ZERO_ERROR;445auto value = loc.getKeywordValue<CharString>("lb", kvStatus);446if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {447uprv_strcat(lb_lw, "_");448uprv_strcat(lb_lw, value.data());449}450// lw=phrase is only supported in Japanese and Korean451if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {452value = loc.getKeywordValue<CharString>("lw", kvStatus);453if (U_SUCCESS(kvStatus) && value == "phrase") {454uprv_strcat(lb_lw, "_");455uprv_strcat(lb_lw, value.data());456}457}458result = BreakIterator::buildInstance(loc, lb_lw, status);459460UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);461UTRACE_EXIT_STATUS(status);462}463break;464case 
UBRK_SENTENCE:465{466UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);467result = BreakIterator::buildInstance(loc, "sentence", status);468#if !UCONFIG_NO_FILTERED_BREAK_ITERATION469char ssKeyValue[kKeyValueLenMax] = {0};470UErrorCode kvStatus = U_ZERO_ERROR;471int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);472if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {473FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);474if (U_SUCCESS(kvStatus)) {475result = fbiBuilder->build(result, status);476delete fbiBuilder;477}478}479#endif480UTRACE_EXIT_STATUS(status);481}482break;483case UBRK_TITLE:484{485UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);486result = BreakIterator::buildInstance(loc, "title", status);487UTRACE_EXIT_STATUS(status);488}489break;490default:491status = U_ILLEGAL_ARGUMENT_ERROR;492}493494if (U_FAILURE(status)) {495delete result;496return nullptr;497}498499return result;500}501502Locale503BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {504if (U_FAILURE(status)) {505return Locale::getRoot();506}507if (type == ULOC_REQUESTED_LOCALE) {508return requestLocale == nullptr ?509Locale::getRoot() : Locale(requestLocale->data());510}511return LocaleBased::getLocale(validLocale, actualLocale, type, status);512}513514const char *515BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {516if (U_FAILURE(status)) {517return nullptr;518}519if (type == ULOC_REQUESTED_LOCALE) {520return requestLocale == nullptr ? 
"" : requestLocale->data();521}522return LocaleBased::getLocaleID(validLocale, actualLocale, type, status);523}524525526// This implementation of getRuleStatus is a do-nothing stub, here to527// provide a default implementation for any derived BreakIterator classes that528// do not implement it themselves.529int32_t BreakIterator::getRuleStatus() const {530return 0;531}532533// This implementation of getRuleStatusVec is a do-nothing stub, here to534// provide a default implementation for any derived BreakIterator classes that535// do not implement it themselves.536int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {537if (U_FAILURE(status)) {538return 0;539}540if (capacity < 1) {541status = U_BUFFER_OVERFLOW_ERROR;542return 1;543}544*fillInVec = 0;545return 1;546}547548BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {549UErrorCode status = U_ZERO_ERROR;550U_LOCALE_BASED(locBased, (*this));551locBased.setLocaleIDs(valid.getName(), actual.getName(), status);552U_ASSERT(U_SUCCESS(status));553}554555U_NAMESPACE_END556557#endif /* #if !UCONFIG_NO_BREAK_ITERATION */558559//eof560561562