CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Encoding/Utf16.h
Views: 1401
#pragma once12#include <cstdint>34#include "Common/BitSet.h"56// Should optimize out.7#define UTF16_IS_LITTLE_ENDIAN (*(const uint16_t *)"\0\xff" >= 0x100)89template <bool is_little>10uint16_t UTF16_Swap(uint16_t u) {11if (is_little) {12return UTF16_IS_LITTLE_ENDIAN ? u : swap16(u);13} else {14return UTF16_IS_LITTLE_ENDIAN ? swap16(u) : u;15}16}1718template <bool is_little>19struct UTF16_Type {20public:21static const char32_t INVALID = (char32_t)-1;2223UTF16_Type(const char16_t *c) : c_(c), index_(0) {}2425char32_t next() {26const char32_t u = UTF16_Swap<is_little>(c_[index_++]);2728// Surrogate pair. UTF-16 is so simple. We assume it's valid.29if ((u & 0xF800) == 0xD800) {30return 0x10000 + (((u & 0x3FF) << 10) | (UTF16_Swap<is_little>(c_[index_++]) & 0x3FF));31}32return u;33}3435bool end() const {36return c_[index_] == 0;37}3839int length() const {40int len = 0;41for (UTF16_Type<is_little> dec(c_); !dec.end(); dec.next())42++len;43return len;44}4546int shortIndex() const {47return index_;48}4950static int encode(char16_t *dest, char32_t u) {51if (u >= 0x10000) {52u -= 0x10000;53*dest++ = UTF16_Swap<is_little>(0xD800 + ((u >> 10) & 0x3FF));54*dest = UTF16_Swap<is_little>(0xDC00 + ((u >> 0) & 0x3FF));55return 2;56} else {57*dest = UTF16_Swap<is_little>((char16_t)u);58return 1;59}60}6162// Rejects non-UCS2 codepoints.63static int encodeUCS2(char16_t *dest, char32_t u) {64if (u >= 0x10000 || (u >= 0xD800 && u <= 0xDFFF)) {65return 0;66} else {67*dest = UTF16_Swap<is_little>((char16_t)u);68return 1;69}70}7172static int encodeUnits(char32_t u) {73if (u >= 0x10000) {74return 2;75} else {76return 1;77}78}7980static int encodeUnitsUCS2(char32_t u) {81if (u >= 0x10000 || (u >= 0xD800 && u <= 0xDFFF)) {82return 0;83} else {84return 1;85}86}87private:88const char16_t *c_;89int index_;90};9192typedef UTF16_Type<true> UTF16LE;93typedef UTF16_Type<false> UTF16BE;949596