// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/**3*******************************************************************************4* Copyright (C) 2006-2014, International Business Machines Corporation *5* and others. All Rights Reserved. *6*******************************************************************************7*/89#ifndef DICTBE_H10#define DICTBE_H1112#include "unicode/utypes.h"13#include "unicode/uniset.h"14#include "unicode/utext.h"1516#include "brkeng.h"17#include "hash.h"18#include "uvectr32.h"1920U_NAMESPACE_BEGIN2122class DictionaryMatcher;23class Normalizer2;2425/*******************************************************************26* DictionaryBreakEngine27*/2829/**30* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a31* dictionary to determine language-specific breaks.</p>32*33* <p>After it is constructed a DictionaryBreakEngine may be shared between34* threads without synchronization.</p>35*/36class DictionaryBreakEngine : public LanguageBreakEngine {37private:38/**39* The set of characters handled by this engine40* @internal41*/4243UnicodeSet fSet;4445public:4647/**48* <p>Constructor </p>49*/50DictionaryBreakEngine();5152/**53* <p>Virtual destructor.</p>54*/55virtual ~DictionaryBreakEngine();5657/**58* <p>Indicate whether this engine handles a particular character for59* a particular kind of break.</p>60*61* @param c A character which begins a run that the engine might handle62* @return true if this engine handles the particular character and break63* type.64*/65virtual UBool handles(UChar32 c) const override;6667/**68* <p>Find any breaks within a run in the supplied text.</p>69*70* @param text A UText representing the text. The iterator is left at71* the end of the run of characters which the engine is capable of handling72* that starts from the first character in the range.73* @param startPos The start of the run within the supplied text.74* @param endPos The end of the run within the supplied text.75* @param foundBreaks vector of int32_t to receive the break positions76* @param status Information on any errors encountered.77* @return The number of breaks found.78*/79virtual int32_t findBreaks( UText *text,80int32_t startPos,81int32_t endPos,82UVector32 &foundBreaks,83UBool isPhraseBreaking,84UErrorCode& status ) const override;8586protected:8788/**89* <p>Set the character set handled by this engine.</p>90*91* @param set A UnicodeSet of the set of characters handled by the engine92*/93virtual void setCharacters( const UnicodeSet &set );9495/**96* <p>Divide up a range of known dictionary characters handled by this break engine.</p>97*98* @param text A UText representing the text99* @param rangeStart The start of the range of dictionary characters100* @param rangeEnd The end of the range of dictionary characters101* @param foundBreaks Output of C array of int32_t break positions, or 0102* @param status Information on any errors encountered.103* @return The number of breaks found104*/105virtual int32_t divideUpDictionaryRange( UText *text,106int32_t rangeStart,107int32_t rangeEnd,108UVector32 &foundBreaks,109UBool isPhraseBreaking,110UErrorCode& status) const = 0;111112};113114/*******************************************************************115* ThaiBreakEngine116*/117118/**119* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a120* dictionary and heuristics to determine Thai-specific breaks.</p>121*122* <p>After it is constructed a ThaiBreakEngine may be shared between123* threads without synchronization.</p>124*/125class ThaiBreakEngine : public DictionaryBreakEngine {126private:127/**128* The set of characters handled by this engine129* @internal130*/131132UnicodeSet fEndWordSet;133UnicodeSet fBeginWordSet;134UnicodeSet fSuffixSet;135UnicodeSet fMarkSet;136DictionaryMatcher *fDictionary;137138public:139140/**141* <p>Default constructor.</p>142*143* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the144* engine is deleted.145*/146ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);147148/**149* <p>Virtual destructor.</p>150*/151virtual ~ThaiBreakEngine();152153protected:154/**155* <p>Divide up a range of known dictionary characters handled by this break engine.</p>156*157* @param text A UText representing the text158* @param rangeStart The start of the range of dictionary characters159* @param rangeEnd The end of the range of dictionary characters160* @param foundBreaks Output of C array of int32_t break positions, or 0161* @param status Information on any errors encountered.162* @return The number of breaks found163*/164virtual int32_t divideUpDictionaryRange( UText *text,165int32_t rangeStart,166int32_t rangeEnd,167UVector32 &foundBreaks,168UBool isPhraseBreaking,169UErrorCode& status) const override;170171};172173/*******************************************************************174* LaoBreakEngine175*/176177/**178* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a179* dictionary and heuristics to determine Lao-specific breaks.</p>180*181* <p>After it is constructed a LaoBreakEngine may be shared between182* threads without synchronization.</p>183*/184class LaoBreakEngine : public DictionaryBreakEngine {185private:186/**187* The set of characters handled by this engine188* @internal189*/190191UnicodeSet fEndWordSet;192UnicodeSet fBeginWordSet;193UnicodeSet fMarkSet;194DictionaryMatcher *fDictionary;195196public:197198/**199* <p>Default constructor.</p>200*201* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the202* engine is deleted.203*/204LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);205206/**207* <p>Virtual destructor.</p>208*/209virtual ~LaoBreakEngine();210211protected:212/**213* <p>Divide up a range of known dictionary characters handled by this break engine.</p>214*215* @param text A UText representing the text216* @param rangeStart The start of the range of dictionary characters217* @param rangeEnd The end of the range of dictionary characters218* @param foundBreaks Output of C array of int32_t break positions, or 0219* @param status Information on any errors encountered.220* @return The number of breaks found221*/222virtual int32_t divideUpDictionaryRange( UText *text,223int32_t rangeStart,224int32_t rangeEnd,225UVector32 &foundBreaks,226UBool isPhraseBreaking,227UErrorCode& status) const override;228229};230231/*******************************************************************232* BurmeseBreakEngine233*/234235/**236* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a237* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>238*239* <p>After it is constructed a BurmeseBreakEngine may be shared between240* threads without synchronization.</p>241*/242class BurmeseBreakEngine : public DictionaryBreakEngine {243private:244/**245* The set of characters handled by this engine246* @internal247*/248249UnicodeSet fEndWordSet;250UnicodeSet fBeginWordSet;251UnicodeSet fMarkSet;252DictionaryMatcher *fDictionary;253254public:255256/**257* <p>Default constructor.</p>258*259* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the260* engine is deleted.261*/262BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);263264/**265* <p>Virtual destructor.</p>266*/267virtual ~BurmeseBreakEngine();268269protected:270/**271* <p>Divide up a range of known dictionary characters.</p>272*273* @param text A UText representing the text274* @param rangeStart The start of the range of dictionary characters275* @param rangeEnd The end of the range of dictionary characters276* @param foundBreaks Output of C array of int32_t break positions, or 0277* @param status Information on any errors encountered.278* @return The number of breaks found279*/280virtual int32_t divideUpDictionaryRange( UText *text,281int32_t rangeStart,282int32_t rangeEnd,283UVector32 &foundBreaks,284UBool isPhraseBreaking,285UErrorCode& status) const override;286287};288289/*******************************************************************290* KhmerBreakEngine291*/292293/**294* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a295* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>296*297* <p>After it is constructed a KhmerBreakEngine may be shared between298* threads without synchronization.</p>299*/300class KhmerBreakEngine : public DictionaryBreakEngine {301private:302/**303* The set of characters handled by this engine304* @internal305*/306307UnicodeSet fEndWordSet;308UnicodeSet fBeginWordSet;309UnicodeSet fMarkSet;310DictionaryMatcher *fDictionary;311312public:313314/**315* <p>Default constructor.</p>316*317* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the318* engine is deleted.319*/320KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);321322/**323* <p>Virtual destructor.</p>324*/325virtual ~KhmerBreakEngine();326327protected:328/**329* <p>Divide up a range of known dictionary characters.</p>330*331* @param text A UText representing the text332* @param rangeStart The start of the range of dictionary characters333* @param rangeEnd The end of the range of dictionary characters334* @param foundBreaks Output of C array of int32_t break positions, or 0335* @param status Information on any errors encountered.336* @return The number of breaks found337*/338virtual int32_t divideUpDictionaryRange( UText *text,339int32_t rangeStart,340int32_t rangeEnd,341UVector32 &foundBreaks,342UBool isPhraseBreaking,343UErrorCode& status) const override;344345};346347#if !UCONFIG_NO_NORMALIZATION348349/*******************************************************************350* CjkBreakEngine351*/352353//indicates language/script that the CjkBreakEngine will handle354enum LanguageType {355kKorean,356kChineseJapanese357};358359/**360* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a361* dictionary with costs associated with each word and362* Viterbi decoding to determine CJK-specific breaks.</p>363*/364class CjkBreakEngine : public DictionaryBreakEngine {365protected:366/**367* The set of characters handled by this engine368* @internal369*/370UnicodeSet fHangulWordSet;371UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;372UnicodeSet fClosePunctuationSet;373374DictionaryMatcher *fDictionary;375const Normalizer2 *nfkcNorm2;376377private:378// Load Japanese extensions.379void loadJapaneseExtensions(UErrorCode& error);380// Load Japanese Hiragana.381void loadHiragana(UErrorCode& error);382// Initialize fSkipSet by loading Japanese Hiragana and extensions.383void initJapanesePhraseParameter(UErrorCode& error);384385Hashtable fSkipSet;386387public:388389/**390* <p>Default constructor.</p>391*392* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the393* engine is deleted. The DictionaryMatcher must contain costs for each word394* in order for the dictionary to work properly.395*/396CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);397398/**399* <p>Virtual destructor.</p>400*/401virtual ~CjkBreakEngine();402403protected:404/**405* <p>Divide up a range of known dictionary characters handled by this break engine.</p>406*407* @param text A UText representing the text408* @param rangeStart The start of the range of dictionary characters409* @param rangeEnd The end of the range of dictionary characters410* @param foundBreaks Output of C array of int32_t break positions, or 0411* @param status Information on any errors encountered.412* @return The number of breaks found413*/414virtual int32_t divideUpDictionaryRange( UText *text,415int32_t rangeStart,416int32_t rangeEnd,417UVector32 &foundBreaks,418UBool isPhraseBreaking,419UErrorCode& status) const override;420421};422423#endif424425U_NAMESPACE_END426427/* DICTBE_H */428#endif429430431