// Source page metadata (not part of the original header):
//   Path: blob/master/thirdparty/icu4c/common/dictionarydata.h
//   9902 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2014, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* dictionarydata.h8*9* created on: 2012may3110* created by: Markus W. Scherer & Maxime Serrano11*/1213#ifndef __DICTIONARYDATA_H__14#define __DICTIONARYDATA_H__1516#include "unicode/utypes.h"1718#if !UCONFIG_NO_BREAK_ITERATION1920#include "unicode/utext.h"21#include "unicode/udata.h"22#include "udataswp.h"23#include "unicode/uobject.h"24#include "unicode/ustringtrie.h"2526U_NAMESPACE_BEGIN2728class UCharsTrie;29class BytesTrie;3031class U_COMMON_API DictionaryData : public UMemory {32public:33static const int32_t TRIE_TYPE_BYTES; // = 0;34static const int32_t TRIE_TYPE_UCHARS; // = 1;35static const int32_t TRIE_TYPE_MASK; // = 7;36static const int32_t TRIE_HAS_VALUES; // = 8;3738static const int32_t TRANSFORM_NONE; // = 0;39static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;40static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;41static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;4243enum {44// Byte offsets from the start of the data, after the generic header.45IX_STRING_TRIE_OFFSET,46IX_RESERVED1_OFFSET,47IX_RESERVED2_OFFSET,48IX_TOTAL_SIZE,4950// Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.51IX_TRIE_TYPE,52// Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.53IX_TRANSFORM,5455IX_RESERVED6,56IX_RESERVED7,57IX_COUNT58};59};6061/**62* Wrapper class around generic dictionaries, implementing matches().63* getType() should return a TRIE_TYPE_??? 
constant from DictionaryData.64*65* All implementations of this interface must be thread-safe if they are to be used inside of the66* dictionary-based break iteration code.67*/68class U_COMMON_API DictionaryMatcher : public UMemory {69public:70DictionaryMatcher() {}71virtual ~DictionaryMatcher();72// this should emulate CompactTrieDictionary::matches()73/* @param text The text in which to look for matching words. Matching begins74* at the current position of the UText.75* @param maxLength The max length of match to consider. Units are the native indexing76* units of the UText.77* @param limit Capacity of output arrays, which is also the maximum number of78* matching words to be found.79* @param lengths output array, filled with the lengths of the matches, in order,80* from shortest to longest. Lengths are in native indexing units81* of the UText. May be nullptr.82* @param cpLengths output array, filled with the lengths of the matches, in order,83* from shortest to longest. Lengths are the number of Unicode code points.84* May be nullptr.85* @param values Output array, filled with the values associated with the words found.86* May be nullptr.87* @param prefix Output parameter, the code point length of the prefix match, even if that88* prefix didn't lead to a complete word. Will always be >= the cpLength89* of the longest complete word matched. 
May be nullptr.90* @return Number of matching words found.91*/92virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,93int32_t *lengths, int32_t *cpLengths, int32_t *values,94int32_t *prefix) const = 0;9596/** @return DictionaryData::TRIE_TYPE_XYZ */97virtual int32_t getType() const = 0;98};99100// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary101class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {102public:103// constructs a new UCharsDictionaryMatcher.104// The UDataMemory * will be closed on this object's destruction.105UCharsDictionaryMatcher(const char16_t *c, UDataMemory *f) : characters(c), file(f) { }106virtual ~UCharsDictionaryMatcher();107virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,108int32_t *lengths, int32_t *cpLengths, int32_t *values,109int32_t *prefix) const override;110virtual int32_t getType() const override;111private:112const char16_t *characters;113UDataMemory *file;114};115116// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary117class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {118public:119// constructs a new BytesTrieDictionaryMatcher120// the transform constant should be the constant read from the file, not a masked version!121// the UDataMemory * fed in here will be closed on this object's destruction122BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)123: characters(c), transformConstant(t), file(f) { }124virtual ~BytesDictionaryMatcher();125virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,126int32_t *lengths, int32_t *cpLengths, int32_t *values,127int32_t *prefix) const override;128virtual int32_t getType() const override;129private:130UChar32 transform(UChar32 c) const;131132const char *characters;133int32_t transformConstant;134UDataMemory *file;135};136137U_NAMESPACE_END138139U_CAPI int32_t U_EXPORT2140udict_swap(const UDataSwapper *ds, const void *inData, 
int32_t length, void *outData, UErrorCode *pErrorCode);141142/**143* Format of dictionary .dict data files.144* Format version 1.0.145*146* A dictionary .dict data file contains a byte-serialized BytesTrie or147* a UChars-serialized UCharsTrie.148* Such files are used in dictionary-based break iteration (DBBI).149*150* For a BytesTrie, a transformation type is specified for151* transforming Unicode strings into byte sequences.152*153* A .dict file begins with a standard ICU data file header154* (DataHeader, see ucmndata.h and unicode/udata.h).155* The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).156*157* After the header, the file contains the following parts.158* Constants are defined in the DictionaryData class.159*160* For the data structure of BytesTrie & UCharsTrie see161* https://icu.unicode.org/design/struct/tries162* and the bytestrie.h and ucharstrie.h header files.163*164* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;165*166* The first four indexes are byte offsets in ascending order.167* Each byte offset marks the start of the next part in the data file,168* and the end of the previous one.169* When two consecutive byte offsets are the same, then the corresponding part is empty.170* Byte offsets are offsets from after the header,171* that is, from the beginning of the indexes[].172* Each part starts at an offset with proper alignment for its data.173* If necessary, the previous part may include padding bytes to achieve this alignment.174*175* trieType=indexes[IX_TRIE_TYPE] defines the trie type.176* transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.177* If the transformation type is TRANSFORM_TYPE_OFFSET,178* then the lower 21 bits contain the offset code point.179* Each code point c is mapped to byte b = (c - offset).180* Code points outside the range offset..(offset+0xff) cannot be mapped181* and do not occur in the dictionary.182*183* stringTrie; -- a serialized 
BytesTrie or UCharsTrie184*185* The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),186* or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).187*/188189#endif /* !UCONFIG_NO_BREAK_ITERATION */190#endif /* __DICTIONARYDATA_H__ */191192193