// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/**3*******************************************************************************4* Copyright (C) 2006-2014, International Business Machines Corporation *5* and others. All Rights Reserved. *6*******************************************************************************7*/89#ifndef DICTBE_H10#define DICTBE_H1112#include "unicode/utypes.h"13#include "unicode/uniset.h"14#include "unicode/utext.h"1516#include "brkeng.h"17#include "hash.h"18#include "mlbe.h"19#include "uvectr32.h"2021U_NAMESPACE_BEGIN2223class DictionaryMatcher;24class MlBreakEngine;25class Normalizer2;2627/*******************************************************************28* DictionaryBreakEngine29*/3031/**32* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a33* dictionary to determine language-specific breaks.</p>34*35* <p>After it is constructed a DictionaryBreakEngine may be shared between36* threads without synchronization.</p>37*/38class DictionaryBreakEngine : public LanguageBreakEngine {39private:40/**41* The set of characters handled by this engine42* @internal43*/4445UnicodeSet fSet;4647public:4849/**50* <p>Constructor </p>51*/52DictionaryBreakEngine();5354/**55* <p>Virtual destructor.</p>56*/57virtual ~DictionaryBreakEngine();5859/**60* <p>Indicate whether this engine handles a particular character for61* a particular kind of break.</p>62*63* @param c A character which begins a run that the engine might handle64* @param locale The locale.65* @return true if this engine handles the particular character and break66* type.67*/68virtual UBool handles(UChar32 c, const char* locale) const override;6970/**71* <p>Find any breaks within a run in the supplied text.</p>72*73* @param text A UText representing the text. The iterator is left at74* the end of the run of characters which the engine is capable of handling75* that starts from the first character in the range.76* @param startPos The start of the run within the supplied text.77* @param endPos The end of the run within the supplied text.78* @param foundBreaks vector of int32_t to receive the break positions79* @param status Information on any errors encountered.80* @return The number of breaks found.81*/82virtual int32_t findBreaks( UText *text,83int32_t startPos,84int32_t endPos,85UVector32 &foundBreaks,86UBool isPhraseBreaking,87UErrorCode& status ) const override;8889protected:9091/**92* <p>Set the character set handled by this engine.</p>93*94* @param set A UnicodeSet of the set of characters handled by the engine95*/96virtual void setCharacters( const UnicodeSet &set );9798/**99* <p>Divide up a range of known dictionary characters handled by this break engine.</p>100*101* @param text A UText representing the text102* @param rangeStart The start of the range of dictionary characters103* @param rangeEnd The end of the range of dictionary characters104* @param foundBreaks Output of C array of int32_t break positions, or 0105* @param status Information on any errors encountered.106* @return The number of breaks found107*/108virtual int32_t divideUpDictionaryRange( UText *text,109int32_t rangeStart,110int32_t rangeEnd,111UVector32 &foundBreaks,112UBool isPhraseBreaking,113UErrorCode& status) const = 0;114115};116117/*******************************************************************118* ThaiBreakEngine119*/120121/**122* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a123* dictionary and heuristics to determine Thai-specific breaks.</p>124*125* <p>After it is constructed a ThaiBreakEngine may be shared between126* threads without synchronization.</p>127*/128class ThaiBreakEngine : public DictionaryBreakEngine {129private:130/**131* The set of characters handled by this engine132* @internal133*/134135UnicodeSet fEndWordSet;136UnicodeSet fBeginWordSet;137UnicodeSet fSuffixSet;138UnicodeSet fMarkSet;139DictionaryMatcher *fDictionary;140141public:142143/**144* <p>Default constructor.</p>145*146* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the147* engine is deleted.148*/149ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);150151/**152* <p>Virtual destructor.</p>153*/154virtual ~ThaiBreakEngine();155156protected:157/**158* <p>Divide up a range of known dictionary characters handled by this break engine.</p>159*160* @param text A UText representing the text161* @param rangeStart The start of the range of dictionary characters162* @param rangeEnd The end of the range of dictionary characters163* @param foundBreaks Output of C array of int32_t break positions, or 0164* @param status Information on any errors encountered.165* @return The number of breaks found166*/167virtual int32_t divideUpDictionaryRange( UText *text,168int32_t rangeStart,169int32_t rangeEnd,170UVector32 &foundBreaks,171UBool isPhraseBreaking,172UErrorCode& status) const override;173174};175176/*******************************************************************177* LaoBreakEngine178*/179180/**181* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a182* dictionary and heuristics to determine Lao-specific breaks.</p>183*184* <p>After it is constructed a LaoBreakEngine may be shared between185* threads without synchronization.</p>186*/187class LaoBreakEngine : public DictionaryBreakEngine {188private:189/**190* The set of characters handled by this engine191* @internal192*/193194UnicodeSet fEndWordSet;195UnicodeSet fBeginWordSet;196UnicodeSet fMarkSet;197DictionaryMatcher *fDictionary;198199public:200201/**202* <p>Default constructor.</p>203*204* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the205* engine is deleted.206*/207LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);208209/**210* <p>Virtual destructor.</p>211*/212virtual ~LaoBreakEngine();213214protected:215/**216* <p>Divide up a range of known dictionary characters handled by this break engine.</p>217*218* @param text A UText representing the text219* @param rangeStart The start of the range of dictionary characters220* @param rangeEnd The end of the range of dictionary characters221* @param foundBreaks Output of C array of int32_t break positions, or 0222* @param status Information on any errors encountered.223* @return The number of breaks found224*/225virtual int32_t divideUpDictionaryRange( UText *text,226int32_t rangeStart,227int32_t rangeEnd,228UVector32 &foundBreaks,229UBool isPhraseBreaking,230UErrorCode& status) const override;231232};233234/*******************************************************************235* BurmeseBreakEngine236*/237238/**239* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a240* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>241*242* <p>After it is constructed a BurmeseBreakEngine may be shared between243* threads without synchronization.</p>244*/245class BurmeseBreakEngine : public DictionaryBreakEngine {246private:247/**248* The set of characters handled by this engine249* @internal250*/251252UnicodeSet fEndWordSet;253UnicodeSet fBeginWordSet;254UnicodeSet fMarkSet;255DictionaryMatcher *fDictionary;256257public:258259/**260* <p>Default constructor.</p>261*262* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the263* engine is deleted.264*/265BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);266267/**268* <p>Virtual destructor.</p>269*/270virtual ~BurmeseBreakEngine();271272protected:273/**274* <p>Divide up a range of known dictionary characters.</p>275*276* @param text A UText representing the text277* @param rangeStart The start of the range of dictionary characters278* @param rangeEnd The end of the range of dictionary characters279* @param foundBreaks Output of C array of int32_t break positions, or 0280* @param status Information on any errors encountered.281* @return The number of breaks found282*/283virtual int32_t divideUpDictionaryRange( UText *text,284int32_t rangeStart,285int32_t rangeEnd,286UVector32 &foundBreaks,287UBool isPhraseBreaking,288UErrorCode& status) const override;289290};291292/*******************************************************************293* KhmerBreakEngine294*/295296/**297* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a298* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>299*300* <p>After it is constructed a KhmerBreakEngine may be shared between301* threads without synchronization.</p>302*/303class KhmerBreakEngine : public DictionaryBreakEngine {304private:305/**306* The set of characters handled by this engine307* @internal308*/309310UnicodeSet fEndWordSet;311UnicodeSet fBeginWordSet;312UnicodeSet fMarkSet;313DictionaryMatcher *fDictionary;314315public:316317/**318* <p>Default constructor.</p>319*320* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the321* engine is deleted.322*/323KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);324325/**326* <p>Virtual destructor.</p>327*/328virtual ~KhmerBreakEngine();329330protected:331/**332* <p>Divide up a range of known dictionary characters.</p>333*334* @param text A UText representing the text335* @param rangeStart The start of the range of dictionary characters336* @param rangeEnd The end of the range of dictionary characters337* @param foundBreaks Output of C array of int32_t break positions, or 0338* @param status Information on any errors encountered.339* @return The number of breaks found340*/341virtual int32_t divideUpDictionaryRange( UText *text,342int32_t rangeStart,343int32_t rangeEnd,344UVector32 &foundBreaks,345UBool isPhraseBreaking,346UErrorCode& status) const override;347348};349350#if !UCONFIG_NO_NORMALIZATION351352/*******************************************************************353* CjkBreakEngine354*/355356//indicates language/script that the CjkBreakEngine will handle357enum LanguageType {358kKorean,359kChineseJapanese360};361362/**363* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a364* dictionary with costs associated with each word and365* Viterbi decoding to determine CJK-specific breaks.</p>366*/367class CjkBreakEngine : public DictionaryBreakEngine {368protected:369/**370* The set of characters handled by this engine371* @internal372*/373UnicodeSet fHangulWordSet;374UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;375UnicodeSet fClosePunctuationSet;376377DictionaryMatcher *fDictionary;378const Normalizer2 *nfkcNorm2;379MlBreakEngine *fMlBreakEngine;380bool isCj;381382private:383// Load Japanese extensions.384void loadJapaneseExtensions(UErrorCode& error);385// Load Japanese Hiragana.386void loadHiragana(UErrorCode& error);387// Initialize fSkipSet by loading Japanese Hiragana and extensions.388void initJapanesePhraseParameter(UErrorCode& error);389390Hashtable fSkipSet;391392public:393394/**395* <p>Default constructor.</p>396*397* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the398* engine is deleted. The DictionaryMatcher must contain costs for each word399* in order for the dictionary to work properly.400*/401CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);402403/**404* <p>Virtual destructor.</p>405*/406virtual ~CjkBreakEngine();407408protected:409/**410* <p>Divide up a range of known dictionary characters handled by this break engine.</p>411*412* @param text A UText representing the text413* @param rangeStart The start of the range of dictionary characters414* @param rangeEnd The end of the range of dictionary characters415* @param foundBreaks Output of C array of int32_t break positions, or 0416* @param status Information on any errors encountered.417* @return The number of breaks found418*/419virtual int32_t divideUpDictionaryRange( UText *text,420int32_t rangeStart,421int32_t rangeEnd,422UVector32 &foundBreaks,423UBool isPhraseBreaking,424UErrorCode& status) const override;425426};427428#endif429430U_NAMESPACE_END431432/* DICTBE_H */433#endif434435436