CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Encoding/Utf8.h
Views: 1401
/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.
*/

// Further modified, and C++ stuff added, by a later contributor
// (the contact e-mail address is obfuscated in this copy of the file).

#pragma once

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>

// Low-level cursor helpers. They are only declared here; the definitions live
// in another translation unit. In every one of them `*i` is a byte offset into
// `s` that the call updates.
//
// u8_nextchar takes the buffer size in bytes; u8_nextchar_unsafe, u8_inc and
// u8_dec receive no size, so they cannot be bounded by it.
uint32_t u8_nextchar(const char *s, int *i, size_t size);
uint32_t u8_nextchar_unsafe(const char *s, int *i);
// Writes the UTF-8 encoding of `ch` to `dest`. NOTE(review): presumably returns
// the number of bytes written and expects room for up to 4 bytes - confirm
// against the implementation.
int u8_wc_toutf8(char *dest, uint32_t ch);
void u8_inc(const char *s, int *i);
void u8_dec(const char *s, int *i);

// Cheap heuristic: treats every code point above the Basic Multilingual Plane
// as a potential emoji.
inline bool CodepointIsProbablyEmoji(uint32_t c) {
	// Original check was some ranges grabbed from https://stackoverflow.com/a/62898106.
	// But let's just go with checking if outside the BMP, it's not a big deal if we accidentally
	// switch to color when not needed if someone uses a weird glyph.
	return c > 0xFFFF;
}

// NOTE(review): presumably only the first `byteCount` bytes of `str` are
// examined - confirm against the implementation.
bool AnyEmojiInString(std::string_view str, size_t byteCount);

// Forward/backward cursor over a UTF-8 encoded byte buffer.
//
// The object does not own the buffer: it stores the pointer it was given, so
// the underlying storage must outlive the cursor. Sizes and indices are byte
// counts stored as int.
class UTF8 {
public:
	// All bits set; no member of this class produces it, it is provided for callers.
	static constexpr uint32_t INVALID = static_cast<uint32_t>(-1);

	// TODO: Try to get rid of this constructor.
	// `c` must be a non-null, NUL-terminated string (its length is taken with strlen).
	explicit UTF8(const char *c) : c_(c), index_(0), size_(static_cast<int>(std::strlen(c))) {}
	// Cursor positioned at the first byte of `view`.
	explicit UTF8(std::string_view view) : c_(view.data()), index_(0), size_(static_cast<int>(view.size())) {}
	// Cursor positioned at byte offset `index` of `view`.
	explicit UTF8(std::string_view view, int index) : c_(view.data()), index_(index), size_(static_cast<int>(view.size())) {}

	// True once the cursor has reached the end of the buffer.
	// Uses >= rather than ==: fwd() and next_unsafe() are not given the buffer
	// size, so a truncated trailing sequence can move the index past size_.
	bool end() const { return index_ >= size_; }

	// Returns true if the byte at the cursor cannot start a well-formed UTF-8
	// sequence: a continuation byte (0x80-0xBF), an overlong two-byte lead
	// (0xC0/0xC1), or a lead byte 0xF5 and above (beyond U+10FFFF).
	// Returns false when the cursor is outside the buffer, so that a
	// string_view without a terminating NUL is never read out of bounds.
	bool invalid() const {
		if (index_ < 0 || index_ >= size_) {
			return false;
		}
		const unsigned char c = static_cast<unsigned char>(c_[index_]);
		return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
	}

	// Decodes the code point at the cursor and advances past it.
	uint32_t next() {
		return u8_nextchar(c_, &index_, size_);
	}
	// Allow invalid continuation bytes.
	uint32_t next_unsafe() {
		return u8_nextchar_unsafe(c_, &index_);
	}
	// Decodes the code point at the cursor without moving the cursor.
	uint32_t peek() const {
		int tempIndex = index_;
		return u8_nextchar(c_, &tempIndex, size_);
	}
	// Moves the cursor one character forward (delegates to u8_inc).
	void fwd() {
		u8_inc(c_, &index_);
	}
	// Moves the cursor one character backward (delegates to u8_dec).
	void bwd() {
		u8_dec(c_, &index_);
	}
	// Size of the buffer in bytes (not in code points).
	int length() const {
		return size_;
	}
	// Current cursor position as a byte offset.
	int byteIndex() const {
		return index_;
	}
	// Encodes `ch` into `dest`; thin wrapper around u8_wc_toutf8.
	static int encode(char *dest, uint32_t ch) {
		return u8_wc_toutf8(dest, ch);
	}
	// Number of bytes needed to encode `ch` in UTF-8 (1-4),
	// or 0 if `ch` is above U+10FFFF and therefore not encodable.
	static int encodeUnits(uint32_t ch) {
		if (ch < 0x80) {
			return 1;
		} else if (ch < 0x800) {
			return 2;
		} else if (ch < 0x10000) {
			return 3;
		} else if (ch < 0x110000) {
			return 4;
		}
		return 0;
	}

private:
	// Declaration order is kept in sync with the constructor initializer lists.
	const char *c_;
	int index_;
	int size_;
};

int UTF8StringNonASCIICount(std::string_view utf8string);

bool UTF8StringHasNonASCII(std::string_view utf8string);


// Removes overlong encodings and similar.
std::string SanitizeUTF8(std::string_view utf8string);
std::string CodepointToUTF8(uint32_t codePoint);


// UTF8 to Win32 UTF-16
// Should be used when calling Win32 api calls
#ifdef _WIN32

std::string ConvertWStringToUTF8(const std::wstring &wstr);
std::string ConvertWStringToUTF8(const wchar_t *wstr);
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source);
std::wstring ConvertUTF8ToWString(std::string_view source);

#else

// Used by SymbolMap/assembler
std::wstring ConvertUTF8ToWString(std::string_view source);
std::string ConvertWStringToUTF8(const std::wstring &wstr);

#endif

std::string ConvertUCS2ToUTF8(const std::u16string &wstr);

// Dest size in units, not bytes.
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, std::string_view source);
std::u16string ConvertUTF8ToUCS2(std::string_view source);