/* Path: blob/master/waterbox/libc/functions/_PDCLIB/_PDCLIB_utf8.c (2 views) */
/* UTF-8 codec12This file is part of the Public Domain C Library (PDCLib).3Permission is granted to use, modify, and / or redistribute at will.4*/56#ifndef REGTEST7#include <stdbool.h>8#include <stdint.h>9#include <uchar.h>10#include <assert.h>11#include "_PDCLIB_encoding.h"1213/* Use of the mbstate:14*15* _StUC[0] is the current decoding state16* _St32[1] is the character accumulated so far17*/1819static bool utf8_mbsinit( const mbstate_t *p_s )20{ return p_s->_StUC[0] == 0; }2122enum {23DecStart = 0,2425Dec2B2,2627Dec3B2,28Dec3B3,2930Dec4B2,31Dec4B3,32Dec4B433};3435#define state (p_s->_StUC[0])36#define accum (p_s->_St32[1])3738#define START_CONVERSION \39bool result = true; \4041#define END_CONVERSION \42end_conversion: \43return result4445#define FINISH(_r) do { \46result = (_r); \47goto end_conversion; \48} while(0)4950#define OUT32(_c) do { \51if(p_outbuf) \52(*((*p_outbuf)++)) = (_c); \53(*p_outsz)--; \54_PDCLIB_UNDEFINED(accum); \55state = DecStart; \56} while(0)5758#define CHECK_CONTINUATION \59do { if((c & 0xC0) != 0x80) return false; } while(0)6061static bool utf8toc32(62char32_t *restrict *restrict p_outbuf,63size_t *restrict p_outsz,64const char *restrict *restrict p_inbuf,65size_t *restrict p_insz,66mbstate_t *restrict p_s67)68{69START_CONVERSION70while(*p_outsz && *p_insz) {71unsigned char c = **p_inbuf;72char32_t c32;73switch(state) {74case DecStart:75// 1 byte76if(c <= 0x7F) {77OUT32(c);78} else if(c <= 0xDF) {79accum = (c & 0x1F) << 6;80state = Dec2B2;81} else if(c <= 0xEF) {82accum = (c & 0x0F) << 12;83state = Dec3B2;84} else if(c <= 0xF4) {85accum = (c & 0x07) << 18;86state = Dec4B2;87} else {88// 5+byte sequence illegal89FINISH(false);90}91break;9293case Dec2B2:94CHECK_CONTINUATION;9596c32 = accum | (c & 0x3F);9798// Overlong sequence (e.g. 
NUL injection)99if(c32 <= 0x7F)100FINISH(false);101102OUT32(c32);103break;104105case Dec3B2:106CHECK_CONTINUATION;107accum |= (c & 0x3F) << 6;108state = Dec3B3;109break;110111case Dec3B3:112CHECK_CONTINUATION;113114c32 = accum | (c & 0x3F);115116// Overlong117if(c32 <= 0x07FF)118FINISH(false);119120// Surrogate121if(c32 >= 0xD800 && c32 <= 0xDFFF)122FINISH(false);123124OUT32(c32);125break;126127case Dec4B2:128CHECK_CONTINUATION;129accum |= (c & 0x3F) << 12;130state = Dec4B3;131break;132133case Dec4B3:134CHECK_CONTINUATION;135accum |= (c & 0x3F) << 6;136state = Dec4B4;137break;138139case Dec4B4:140CHECK_CONTINUATION;141142c32 = accum | (c & 0x3F);143144// Overlong145if(c32 <= 0xFFFF) FINISH(false);146147// Not in Unicode148if(c32 > 0x10FFFF) FINISH(false);149150OUT32(c32);151break;152153default:154assert(!"Invalid state");155}156157(*p_inbuf)++;158(*p_insz)--;159}160END_CONVERSION;161}162163enum {164EncStart = 0,165Enc1R,166Enc2R,167Enc3R,168};169170static bool c32toutf8(171char *restrict *restrict p_outbuf,172size_t *restrict p_outsz,173const char32_t *restrict *restrict p_inbuf,174size_t *restrict p_insz,175mbstate_t *restrict p_s176)177{178START_CONVERSION179while(*p_outsz) {180unsigned char outc = 0;181switch(state) {182case Enc3R:183outc = 0x80 | ((accum >> 12) & 0x3F);184state = Enc2R;185break;186187case Enc2R:188outc = 0x80 | ((accum >> 6) & 0x3F);189state = Enc1R;190break;191192case Enc1R:193outc = 0x80 | (accum & 0x3F);194state = EncStart;195_PDCLIB_UNDEFINED(accum);196break;197198case EncStart:199if(*p_insz == 0)200FINISH(true);201202accum = **p_inbuf;203(*p_inbuf)++;204(*p_insz)--;205206if(accum <= 0x7F) {207outc = accum;208state = EncStart;209_PDCLIB_UNDEFINED(accum);210} else if(accum <= 0x7FF) {211outc = 0xC0 | (accum >> 6);212state = Enc1R;213} else if(accum <= 0xFFFF) {214outc = 0xE0 | (accum >> 12);215state = Enc2R;216} else if(accum <= 0x10FFFF) {217outc = 0xF0 | (accum >> 18);218state = Enc3R;219} else 
{220FINISH(false);221}222break;223}224225if(p_outbuf) {226**p_outbuf = outc;227(*p_outbuf)++;228}229(*p_outsz)--;230}231END_CONVERSION;232}233234const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {235.__mbsinit = utf8_mbsinit,236.__mbstoc32s = utf8toc32,237.__c32stombs = c32toutf8,238.__mb_max = 4,239};240241#endif242243#ifdef TEST244#include "_PDCLIB_test.h"245246int main( void )247{248#ifndef REGTEST249// Valid conversion & back250251static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"252"\xF4\x8F\xBF\xBF";253254char32_t c32out[8];255256char32_t *c32ptr = &c32out[0];257size_t c32rem = 8;258const char *chrptr = (char*) &input[0];259size_t chrrem = strlen(input);260mbstate_t mbs = { 0 };261262TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));263TESTCASE(c32rem == 0);264TESTCASE(chrrem == 0);265TESTCASE(c32ptr == &c32out[8]);266TESTCASE(chrptr == &input[strlen(input)]);267TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&268c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&269c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);270271char chrout[strlen(input)];272c32ptr = &c32out[0];273c32rem = 8;274chrptr = &chrout[0];275chrrem = strlen(input);276TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));277TESTCASE(c32rem == 0);278TESTCASE(chrrem == 0);279TESTCASE(c32ptr == &c32out[8]);280TESTCASE(chrptr == &chrout[strlen(input)]);281TESTCASE(memcmp(chrout, input, strlen(input)) == 0);282283// Multi-part conversion284static const char* mpinput = "\xDF\xBF";285c32ptr = &c32out[0];286c32rem = 8;287chrptr = &mpinput[0];288chrrem = 1;289TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));290TESTCASE(c32ptr == &c32out[0]);291TESTCASE(c32rem == 8);292TESTCASE(chrptr == &mpinput[1]);293TESTCASE(chrrem == 0);294chrrem = 1;295TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));296TESTCASE(c32ptr == &c32out[1]);297TESTCASE(c32rem == 7);298TESTCASE(chrptr == &mpinput[2]);299TESTCASE(chrrem == 0);300301// 
Invalid conversions302303// Overlong nuls304const char* nul2 = "\xC0\x80";305c32ptr = &c32out[0];306c32rem = 8;307chrptr = &nul2[0];308chrrem = 2;309TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);310memset(&mbs, 0, sizeof mbs);311const char* nul3 = "\xE0\x80\x80";312c32ptr = &c32out[0];313c32rem = 8;314chrptr = &nul3[0];315chrrem = 3;316TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);317memset(&mbs, 0, sizeof mbs);318const char* nul4 = "\xF0\x80\x80\x80";319c32ptr = &c32out[0];320c32rem = 8;321chrptr = &nul4[0];322chrrem = 4;323TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);324325// Starting on a continuation326const char* cont = "\x80";327c32ptr = &c32out[0];328c32rem = 8;329chrptr = &cont[0];330chrrem = 1;331TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);332#endif333return TEST_RESULTS;334}335336#endif337338339340