CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/Data/Encoding/Utf8.cpp
Views: 1401
/*1Basic UTF-8 manipulation routines2by Jeff Bezanson3placed in the public domain Fall 200545This code is designed to provide the utilities you need to manipulate6UTF-8 as an internal string encoding. These functions do not perform the7error checking normally needed when handling UTF-8 data, so if you happen8to be from the Unicode Consortium you will want to flay me alive.9I do this because error checking can be performed at the boundaries (I/O),10with these routines reserved for higher performance on data known to be11valid.12*/1314#ifdef _WIN3215#include <windows.h>16#undef min17#undef max18#endif1920#include <cstdlib>21#include <cstdio>22#include <cstring>23#include <cstdarg>24#include <cstdint>2526#include <algorithm>27#include <string>2829#include "Common/Data/Encoding/Utf8.h"30#include "Common/Data/Encoding/Utf16.h"31#include "Common/Log.h"3233// is start of UTF sequence34inline bool isutf(char c) {35return (c & 0xC0) != 0x80;36}3738static const uint32_t offsetsFromUTF8[6] = {390x00000000UL, 0x00003080UL, 0x000E2080UL,400x03C82080UL, 0xFA082080UL, 0x82082080UL41};4243static const uint8_t trailingBytesForUTF8[256] = {440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,460,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,490,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,501,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,512,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,52};5354/* returns length of next utf-8 sequence */55int u8_seqlen(const char *s)56{57return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;58}5960/* conversions without error checking61only works for valid UTF-8, i.e. no 5- or 6-byte sequences62srcsz = source size in bytes, or -1 if 0-terminated63sz = dest size in # of wide characters6465returns # characters converted66dest will always be L'\0'-terminated, even if there isn't enough room67for all the characters.68if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.69*/70int u8_toucs(uint32_t *dest, int sz, const char *src, int srcsz)71{72uint32_t ch;73const char *src_end = src + srcsz;74int nb;75int i=0;7677while (i < sz-1) {78nb = trailingBytesForUTF8[(unsigned char)*src];79if (srcsz == -1) {80if (*src == 0)81goto done_toucs;82}83else {84if (src + nb >= src_end)85goto done_toucs;86}87ch = 0;88switch (nb) {89/* these fall through deliberately */90case 3: ch += (unsigned char)*src++; ch <<= 6;91case 2: ch += (unsigned char)*src++; ch <<= 6;92case 1: ch += (unsigned char)*src++; ch <<= 6;93case 0: ch += (unsigned char)*src++;94}95ch -= offsetsFromUTF8[nb];96dest[i++] = ch;97}98done_toucs:99dest[i] = 0;100return i;101}102103/* srcsz = number of source characters, or -1 if 0-terminated104sz = size of dest buffer in bytes105106returns # characters converted107dest will only be '\0'-terminated if there is enough space. this is108for consistency; imagine there are 2 bytes of space left, but the next109character requires 3 bytes. in this case we could NUL-terminate, but in110general we can't when there's insufficient space. therefore this function111only NUL-terminates if all the characters fit, and there's space for112the NUL as well.113the destination string will never be bigger than the source string.114*/115int u8_toutf8(char *dest, int sz, const uint32_t *src, int srcsz)116{117uint32_t ch;118int i = 0;119char *dest_end = dest + sz;120121while (srcsz<0 ? src[i]!=0 : i < srcsz) {122ch = src[i];123if (ch < 0x80) {124if (dest >= dest_end)125return i;126*dest++ = (char)ch;127}128else if (ch < 0x800) {129if (dest >= dest_end-1)130return i;131*dest++ = (ch>>6) | 0xC0;132*dest++ = (ch & 0x3F) | 0x80;133}134else if (ch < 0x10000) {135if (dest >= dest_end-2)136return i;137*dest++ = (ch>>12) | 0xE0;138*dest++ = ((ch>>6) & 0x3F) | 0x80;139*dest++ = (ch & 0x3F) | 0x80;140}141else if (ch < 0x110000) {142if (dest >= dest_end-3)143return i;144*dest++ = (ch>>18) | 0xF0;145*dest++ = ((ch>>12) & 0x3F) | 0x80;146*dest++ = ((ch>>6) & 0x3F) | 0x80;147*dest++ = (ch & 0x3F) | 0x80;148}149i++;150}151if (dest < dest_end)152*dest = '\0';153return i;154}155156int u8_wc_toutf8(char *dest, uint32_t ch)157{158if (ch < 0x80) {159dest[0] = (char)ch;160return 1;161}162if (ch < 0x800) {163dest[0] = (ch>>6) | 0xC0;164dest[1] = (ch & 0x3F) | 0x80;165return 2;166}167if (ch < 0x10000) {168dest[0] = (ch>>12) | 0xE0;169dest[1] = ((ch>>6) & 0x3F) | 0x80;170dest[2] = (ch & 0x3F) | 0x80;171return 3;172}173if (ch < 0x110000) {174dest[0] = (ch>>18) | 0xF0;175dest[1] = ((ch>>12) & 0x3F) | 0x80;176dest[2] = ((ch>>6) & 0x3F) | 0x80;177dest[3] = (ch & 0x3F) | 0x80;178return 4;179}180return 0;181}182183/* charnum => byte offset */184int u8_offset(const char *str, int charnum)185{186int offs=0;187188while (charnum > 0 && str[offs]) {189(void)(isutf(str[++offs]) || isutf(str[++offs]) ||190isutf(str[++offs]) || ++offs);191charnum--;192}193return offs;194}195196/* byte offset => charnum */197int u8_charnum(const char *s, int offset)198{199int charnum = 0, offs=0;200201while (offs < offset && s[offs]) {202(void)(isutf(s[++offs]) || isutf(s[++offs]) ||203isutf(s[++offs]) || ++offs);204charnum++;205}206return charnum;207}208209/* reads the next utf-8 sequence out of a string, updating an index */210uint32_t u8_nextchar(const char *s, int *index, size_t size) {211uint32_t ch = 0;212_dbg_assert_(*index >= 0 && *index < 100000000);213int sz = 0;214int i = *index;215do {216ch = (ch << 6) + (unsigned char)s[i++];217sz++;218} while (i < size && s[i] && ((s[i]) & 0xC0) == 0x80);219*index = i;220return ch - offsetsFromUTF8[sz - 1];221}222223uint32_t u8_nextchar_unsafe(const char *s, int *i) {224uint32_t ch = (unsigned char)s[(*i)++];225int sz = 1;226if (ch >= 0xF0) {227sz++;228ch &= ~0x10;229}230if (ch >= 0xE0) {231sz++;232ch &= ~0x20;233}234if (ch >= 0xC0) {235sz++;236ch &= ~0xC0;237}238239// Just assume the bytes must be there. This is the logic used on the PSP.240for (int j = 1; j < sz; ++j) {241ch <<= 6;242ch += ((unsigned char)s[(*i)++]) & 0x3F;243}244return ch;245}246247void u8_inc(const char *s, int *i)248{249(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||250isutf(s[++(*i)]) || ++(*i));251}252253void u8_dec(const char *s, int *i)254{255(void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) ||256isutf(s[--(*i)]) || --(*i));257}258259int octal_digit(char c)260{261return (c >= '0' && c <= '7');262}263264int hex_digit(char c)265{266return ((c >= '0' && c <= '9') ||267(c >= 'A' && c <= 'F') ||268(c >= 'a' && c <= 'f'));269}270271/* assumes that src points to the character after a backslash272returns number of input characters processed */273int u8_read_escape_sequence(const char *str, uint32_t *dest)274{275long ch;276char digs[9]="\0\0\0\0\0\0\0\0";277int dno=0, i=1;278279ch = (uint32_t)str[0]; /* take literal character */280if (str[0] == 'n')281ch = L'\n';282else if (str[0] == 't')283ch = L'\t';284else if (str[0] == 'r')285ch = L'\r';286else if (str[0] == 'b')287ch = L'\b';288else if (str[0] == 'f')289ch = L'\f';290else if (str[0] == 'v')291ch = L'\v';292else if (str[0] == 'a')293ch = L'\a';294else if (octal_digit(str[0])) {295i = 0;296do {297digs[dno++] = str[i++];298} while (octal_digit(str[i]) && dno < 3);299ch = strtol(digs, NULL, 8);300}301else if (str[0] == 'x') {302while (hex_digit(str[i]) && dno < 2) {303digs[dno++] = str[i++];304}305if (dno > 0)306ch = strtol(digs, NULL, 16);307}308else if (str[0] == 'u') {309while (hex_digit(str[i]) && dno < 4) {310digs[dno++] = str[i++];311}312if (dno > 0)313ch = strtol(digs, NULL, 16);314}315else if (str[0] == 'U') {316while (hex_digit(str[i]) && dno < 8) {317digs[dno++] = str[i++];318}319if (dno > 0)320ch = strtol(digs, NULL, 16);321}322*dest = (uint32_t)ch;323324return i;325}326327/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8328example: u8_unescape(mybuf, 256, "hello\\u220e")329note the double backslash is needed if called on a C string literal */330int u8_unescape(char *buf, int sz, char *src)331{332int c=0, amt;333uint32_t ch;334char temp[4];335336while (*src && c < sz) {337if (*src == '\\') {338src++;339amt = u8_read_escape_sequence(src, &ch);340}341else {342ch = (uint32_t)*src;343amt = 1;344}345src += amt;346amt = u8_wc_toutf8(temp, ch);347if (amt > sz-c)348break;349memcpy(&buf[c], temp, amt);350c += amt;351}352if (c < sz)353buf[c] = '\0';354return c;355}356357int u8_is_locale_utf8(const char *locale)358{359/* this code based on libutf8 */360const char* cp = locale;361362for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) {363if (*cp == '.') {364const char* encoding = ++cp;365for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++)366;367if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5))368|| (cp-encoding == 4 && !strncmp(encoding, "utf8", 4)))369return 1; /* it's UTF-8 */370break;371}372}373return 0;374}375376bool AnyEmojiInString(std::string_view str, size_t byteCount) {377int i = 0;378while (i < byteCount) {379uint32_t c = u8_nextchar(str.data(), &i, str.size());380if (CodepointIsProbablyEmoji(c)) {381return true;382}383}384return false;385}386387int UTF8StringNonASCIICount(std::string_view utf8string) {388UTF8 utf(utf8string);389int count = 0;390while (!utf.end()) {391int c = utf.next();392if (c > 127)393++count;394}395return count;396}397398bool UTF8StringHasNonASCII(std::string_view utf8string) {399return UTF8StringNonASCIICount(utf8string) > 0;400}401402#ifdef _WIN32403404std::string ConvertWStringToUTF8(const wchar_t *wstr) {405int len = (int)wcslen(wstr);406int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr, len, 0, 0, NULL, NULL);407std::string s;408s.resize(size);409if (size > 0) {410WideCharToMultiByte(CP_UTF8, 0, wstr, len, &s[0], size, NULL, NULL);411}412return s;413}414415std::string ConvertWStringToUTF8(const std::wstring &wstr) {416int len = (int)wstr.size();417int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, 0, 0, NULL, NULL);418std::string s;419s.resize(size);420if (size > 0) {421WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, &s[0], size, NULL, NULL);422}423return s;424}425426void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source) {427int len = (int)source.size();428destSize -= 1; // account for the \0.429int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);430MultiByteToWideChar(CP_UTF8, 0, source.data(), len, dest, std::min((int)destSize, size));431dest[size] = 0;432}433434std::wstring ConvertUTF8ToWString(const std::string_view source) {435int len = (int)source.size();436int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);437std::wstring str;438str.resize(size);439if (size > 0) {440MultiByteToWideChar(CP_UTF8, 0, source.data(), (int)source.size(), &str[0], size);441}442return str;443}444445#endif446447std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {448std::string s;449// Worst case.450s.resize(wstr.size() * 4);451452size_t pos = 0;453for (wchar_t c : wstr) {454pos += UTF8::encode(&s[pos], c);455}456457s.resize(pos);458return s;459}460461std::string SanitizeUTF8(std::string_view utf8string) {462UTF8 utf(utf8string);463std::string s;464// Worst case.465s.resize(utf8string.size() * 4);466467// This stops at invalid start bytes.468size_t pos = 0;469while (!utf.end() && !utf.invalid()) {470int c = utf.next_unsafe();471pos += UTF8::encode(&s[pos], c);472}473s.resize(pos);474return s;475}476477static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, std::string_view source) {478const char16_t *const orig = dest;479const char16_t *const destEnd = dest + destSize;480481UTF8 utf(source);482483char16_t *destw = (char16_t *)dest;484const char16_t *const destwEnd = destw + destSize;485486// Ignores characters outside the BMP.487while (uint32_t c = utf.next()) {488if (destw + UTF16LE::encodeUnitsUCS2(c) >= destwEnd) {489break;490}491destw += UTF16LE::encodeUCS2(destw, c);492}493494// No ++ to not count the null-terminator in length.495if (destw < destEnd) {496*destw = 0;497}498499return destw - orig;500}501502void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, const std::string &source) {503ConvertUTF8ToUCS2Internal(dest, destSize, source);504}505506std::u16string ConvertUTF8ToUCS2(std::string_view source) {507std::u16string dst;508// utf-8 won't be less bytes than there are characters.509dst.resize(source.size(), 0);510size_t realLen = ConvertUTF8ToUCS2Internal(&dst[0], source.size(), source);511dst.resize(realLen);512return dst;513}514515std::string CodepointToUTF8(uint32_t codePoint) {516char temp[16]{};517UTF8::encode(temp, codePoint);518return std::string(temp);519}520521#ifndef _WIN32522523// Replacements for the Win32 wstring functions. Not to be used from emulation code!524525std::string ConvertWStringToUTF8(const std::wstring &wstr) {526std::string s;527// Worst case.528s.resize(wstr.size() * 4);529530size_t pos = 0;531for (wchar_t c : wstr) {532pos += UTF8::encode(&s[pos], c);533}534535s.resize(pos);536return s;537}538539static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, std::string_view source) {540const wchar_t *const orig = dest;541const wchar_t *const destEnd = dest + destSize;542543UTF8 utf(source);544545if (sizeof(wchar_t) == 2) {546char16_t *destw = (char16_t *)dest;547const char16_t *const destwEnd = destw + destSize;548while (char32_t c = utf.next()) {549if (destw + UTF16LE::encodeUnits(c) >= destwEnd) {550break;551}552destw += UTF16LE::encode(destw, c);553}554dest = (wchar_t *)destw;555} else {556while (char32_t c = utf.next()) {557if (dest + 1 >= destEnd) {558break;559}560*dest++ = c;561}562}563564// No ++ to not count the terminal in length.565if (dest < destEnd) {566*dest = 0;567}568569return dest - orig;570}571572std::wstring ConvertUTF8ToWString(std::string_view source) {573std::wstring dst;574// conservative size estimate for wide characters from utf-8 bytes. Will always reserve too much space.575dst.resize(source.size());576size_t realLen = ConvertUTF8ToWStringInternal(&dst[0], source.size(), source);577dst.resize(realLen); // no need to write a NUL, it's done for us by resize.578return dst;579}580581#endif582583584