// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3******************************************************************************4*5* Copyright (C) 2007, International Business Machines6* Corporation and others. All Rights Reserved.7*8******************************************************************************9* file name: bmpset.h10* encoding: UTF-811* tab size: 8 (not used)12* indentation:413*14* created on: 2007jan2915* created by: Markus W. Scherer16*/1718#ifndef __BMPSET_H__19#define __BMPSET_H__2021#include "unicode/utypes.h"22#include "unicode/uniset.h"2324U_NAMESPACE_BEGIN2526/*27* Helper class for frozen UnicodeSets, implements contains() and span()28* optimized for BMP code points. Structured to be UTF-8-friendly.29*30* Latin-1: Look up bytes.31* 2-byte characters: Bits organized vertically.32* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,33* with mixed for illegal ranges.34* Supplementary characters: Binary search over35* the supplementary part of the parent set's inversion list.36*/37class BMPSet : public UMemory {38public:39BMPSet(const int32_t *parentList, int32_t parentListLength);40BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);41virtual ~BMPSet();4243virtual UBool contains(UChar32 c) const;4445/*46* Span the initial substring for which each character c has spanCondition==contains(c).47* It must be s<limit and spanCondition==0 or 1.48* @return The string pointer which limits the span.49*/50const char16_t *span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;5152/*53* Span the trailing substring for which each character c has spanCondition==contains(c).54* It must be s<limit and spanCondition==0 or 1.55* @return The string pointer which starts the span.56*/57const char16_t *spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;5859/*60* Span the initial substring for which each character c has spanCondition==contains(c).61* It must be length>0 and spanCondition==0 or 1.62* @return The string pointer which limits the span.63*/64const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;6566/*67* Span the trailing substring for which each character c has spanCondition==contains(c).68* It must be length>0 and spanCondition==0 or 1.69* @return The start of the span.70*/71int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;7273private:74void initBits();75void overrideIllegal();7677/**78* Same as UnicodeSet::findCodePoint(UChar32 c) const except that the79* binary search is restricted for finding code points in a certain range.80*81* For restricting the search for finding in the range start..end,82* pass in83* lo=findCodePoint(start) and84* hi=findCodePoint(end)85* with 0<=lo<=hi<len.86* findCodePoint(c) defaults to lo=0 and hi=len-1.87*88* @param c a character in a subrange of MIN_VALUE..MAX_VALUE89* @param lo The lowest index to be returned.90* @param hi The highest index to be returned.91* @return the smallest integer i in the range lo..hi,92* inclusive, such that c < list[i]93*/94int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;9596inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;9798/*99* One byte 0 or 1 per Latin-1 character.100*/101UBool latin1Contains[0x100];102103/* true if contains(U+FFFD). */104UBool containsFFFD;105106/*107* One bit per code point from U+0000..U+07FF.108* The bits are organized vertically; consecutive code points109* correspond to the same bit positions in consecutive table words.110* With code point parts111* lead=c{10..6}112* trail=c{5..0}113* it is set.contains(c)==(table7FF[trail] bit lead)114*115* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)116* for faster validity checking at runtime.117*/118uint32_t table7FF[64];119120/*121* One bit per 64 BMP code points.122* The bits are organized vertically; consecutive 64-code point blocks123* correspond to the same bit position in consecutive table words.124* With code point parts125* lead=c{15..12}126* t1=c{11..6}127* test bits (lead+16) and lead in bmpBlockBits[t1].128* If the upper bit is 0, then the lower bit indicates if contains(c)129* for all code points in the 64-block.130* If the upper bit is 1, then the block is mixed and set.contains(c)131* must be called.132*133* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to134* the result of contains(FFFD) for faster validity checking at runtime.135*/136uint32_t bmpBlockBits[64];137138/*139* Inversion list indexes for restricted binary searches in140* findCodePoint(), from141* findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).142* U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are143* always looked up in the bit tables.144* The last pair of indexes is for finding supplementary code points.145*/146int32_t list4kStarts[18];147148/*149* The inversion list of the parent set, for the slower contains() implementation150* for mixed BMP blocks and for supplementary code points.151* The list is terminated with list[listLength-1]=0x110000.152*/153const int32_t *list;154int32_t listLength;155};156157inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {158return findCodePoint(c, lo, hi) & 1;159}160161U_NAMESPACE_END162163#endif164165166