// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/tinyiconv/iconv.cpp
// 38771 views
/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef __ANDROID__

#include <stdint.h>

// for char16_t and char32_t
typedef uint32_t char32_t;
typedef uint16_t char16_t;

#include <ctype.h>

#include "iconv.h"
#include <endian.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>

#include "bionic_mbstate.h"


#ifdef __cplusplus
# define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
#else // !__cplusplus
# define INVALID_ICONV_T (iconv_t)(-1)
#endif // __cplusplus

// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.

// The encodings this converter can read and write.
#ifdef __cplusplus
enum Encoding
#else
typedef enum
#endif // __cplusplus
{
  US_ASCII,
  UTF_8,
  UTF_16_LE,
  UTF_16_BE,
  UTF_32_LE,
  UTF_32_BE,
  WCHAR_T,
#ifdef __cplusplus
};
#else
} Encoding;
#endif // __cplusplus

// What to do with a character that is invalid in the source or unrepresentable in the
// destination: fail (ERROR), drop it (IGNORE, "//IGNORE"), or emit '?' (TRANSLIT, "//TRANSLIT").
#ifdef __cplusplus
enum Mode
#else
typedef enum
#endif // __cplusplus
{
  ERROR,
  IGNORE,
  TRANSLIT,
#ifdef __cplusplus
};
#else
} Mode;
#endif // __cplusplus

// Returns true if the user-supplied encoding name `lhs` matches the canonical name `rhs`
// (which must already be lower-case alphanumerics only).
//
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
static bool __match_encoding(const char* lhs, const char* rhs) {
  while (*lhs && *rhs) {
    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
    // Also implement the "delete each 0 that is not preceded by a digit" rule.
    // The <ctype.h> functions are only defined for unsigned char values (and EOF), so
    // bytes are converted before classification; plain char may be signed.
    for (; *lhs; ++lhs) {
      const unsigned char current = (unsigned char)*lhs;
      const unsigned char next = (unsigned char)*(lhs + 1);  // At worst the terminating NUL.
      if (isalnum(current) && (current != '0' || !isdigit(next))) break;
    }
    // Case doesn't matter either.
    if (tolower((unsigned char)*lhs) != tolower((unsigned char)*rhs)) break;
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
  return false;
}

// Parses an encoding name such as "UTF-8" or "ASCII//TRANSLIT".
//
// On success stores the encoding in `*encoding` and, if a "//IGNORE" or "//TRANSLIT" suffix
// is present, the corresponding mode in `*mode`, then returns true. Returns false for an
// unknown encoding, an unknown suffix, or any suffix when `mode` is null (suffixes are not
// accepted in that case).
static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
  const char* suffix = strstr(s, "//");
  if (suffix) {
    if (!mode) return false;
    if (strcmp(suffix, "//IGNORE") == 0) {
      *mode = IGNORE;
    } else if (strcmp(suffix, "//TRANSLIT") == 0) {
      *mode = TRANSLIT;
    } else {
      return false;
    }
  }
  if (__match_encoding(s, "utf8")) {
    *encoding = UTF_8;
  } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
    *encoding = US_ASCII;
  } else if (__match_encoding(s, "utf16le")) {
    *encoding = UTF_16_LE;
  } else if (__match_encoding(s, "utf16be")) {
    *encoding = UTF_16_BE;
  } else if (__match_encoding(s, "utf32le")) {
    *encoding = UTF_32_LE;
  } else if (__match_encoding(s, "utf32be")) {
    *encoding = UTF_32_BE;
  } else if (__match_encoding(s, "wchart")) {
    *encoding = WCHAR_T;
  } else {
    return false;
  }
  return true;
}

// Conversion descriptor: decodes one code point at a time from the source encoding into
// `wc` and re-encodes it into the destination encoding.
struct __iconv_t {
  Encoding src_encoding;
  Encoding dst_encoding;
  Mode mode;

  __iconv_t() : mode(ERROR) {
  }

  // Converts as much of the caller's input as possible, advancing the caller's buffer
  // pointers and remaining-byte counts as characters are consumed and produced.
  //
  // Returns -1 with errno set (EILSEQ, EINVAL or E2BIG) on failure; otherwise the value
  // computed by Done().
  int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
    // Reset state.
    wc = 0;
    memset(&ps, 0, sizeof(ps));
    replacement_count = 0;
    ignored = false;
    skip_current = false;
    src_buf = src_buf0;
    src_bytes_left = src_bytes_left0;
    dst_buf = dst_buf0;
    dst_bytes_left = dst_bytes_left0;

    while (*src_bytes_left > 0) {
      if (!GetNext()) return -1;
      // In IGNORE mode GetNext() may have dropped an invalid character; there is nothing
      // to encode, and the loop condition re-checks whether any input remains.
      if (skip_current) continue;
      if (!Convert()) return -1;
    }
    return Done();
  }

 private:
  char32_t wc;               // The code point currently being converted.
  char buf[16];              // Encoded form of `wc` in the destination encoding.
  size_t src_bytes_used;     // Source bytes occupied by the current character.
  size_t dst_bytes_used;     // Bytes of `buf` occupied by the encoded character.
  mbstate_t ps;
  size_t replacement_count;  // Characters replaced by '?' in TRANSLIT mode.
  bool ignored;              // Whether any character was dropped in IGNORE mode.
  bool skip_current;         // Set by GetNext() when it dropped the current character.
  char** src_buf;
  size_t* src_bytes_left;
  char** dst_buf;
  size_t* dst_bytes_left;

  // Decodes the next character from the source into `wc` and records its length in
  // `src_bytes_used`. Requires `*src_bytes_left > 0`.
  //
  // Returns false with errno set to EINVAL for a truncated character, or EILSEQ for an
  // invalid one (subject to `mode` — see below). Returns true with `skip_current` set when
  // an invalid character was consumed and dropped in IGNORE mode.
  bool GetNext() {
    errno = 0;
    skip_current = false;
    switch (src_encoding) {
      case US_ASCII:
        wc = static_cast<uint8_t>(**src_buf);
        src_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
        if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          // The error code is not a byte count: resynchronize by consuming exactly one
          // byte, so the IGNORE/TRANSLIT paths move the source cursor forwards.
          src_bytes_used = 1;
          memset(&ps, 0, sizeof(ps));
          errno = EILSEQ;
        } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        } else if (src_bytes_used == 0) {
          // mbrtoc32() returns 0 for the NUL character, which still occupies one byte.
          // Without this the conversion loop would never advance past an embedded NUL.
          src_bytes_used = 1;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        if (*src_bytes_left < 2) {
          errno = EINVAL;
          return false;
        }
        bool swap = (src_encoding == UTF_16_BE);
        wc = In16(*src_buf, swap);
        // 0xd800-0xdbff: high surrogates
        // 0xdc00-0xdfff: low surrogates
        if (wc >= 0xd800 && wc <= 0xdfff) {
          if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
            errno = EILSEQ;
            return false;
          }
          if (*src_bytes_left < 4) {
            errno = EINVAL;
            return false;
          }
          uint16_t hi = wc;
          uint16_t lo = In16(*src_buf + 2, swap);
          if (lo < 0xdc00 || lo > 0xdfff) {  // High surrogate without a low surrogate.
            errno = EILSEQ;
            return false;
          }
          wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
          src_bytes_used = 4;
        }
        break;
      }

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        if (*src_bytes_left < 4) {
          errno = EINVAL;
          return false;
        }
        wc = In32(*src_buf, (src_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      switch (mode) {
        case ERROR:
          return false;
        case IGNORE:
          // Drop the offending input and let the caller's loop fetch the next character
          // (if any input is left) instead of reading on unconditionally.
          *src_buf += src_bytes_used;
          *src_bytes_left -= src_bytes_used;
          ignored = true;
          skip_current = true;
          return true;
        case TRANSLIT:
          wc = '?';
          ++replacement_count;
          return true;
      }
    }
    return true;
  }

  // Encodes `wc` into `buf` using the destination encoding and emits it.
  //
  // A character the destination cannot represent is dropped (IGNORE), replaced by '?'
  // (TRANSLIT), or reported as EILSEQ (ERROR). Returns false with errno set on failure.
  bool Convert() {
    errno = 0;
    switch (dst_encoding) {
      case US_ASCII:
        buf[0] = wc;
        dst_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        dst_bytes_used = c32rtomb(buf, wc, &ps);
        if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          break;  // EILSEQ already set.
        } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        bool swap = (dst_encoding == UTF_16_BE);
        if (wc < 0x10000) {  // BMP.
          Out16(buf, wc, swap);
        } else if (wc > 0x10ffff) {  // Beyond what a surrogate pair can encode.
          errno = EILSEQ;
        } else {  // Supplementary plane; output surrogate pair.
          char32_t offset = wc - 0x10000;
          char16_t hi = 0xd800 | (offset >> 10);
          char16_t lo = 0xdc00 | (offset & 0x3ff);
          Out16(buf + 0, hi, swap);
          Out16(buf + 2, lo, swap);
          dst_bytes_used = 4;
        }
      } break;

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        Out32(wc, (dst_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      if (mode == IGNORE) {
        *src_buf += src_bytes_used;
        *src_bytes_left -= src_bytes_used;
        ignored = true;
        return true;
      } else if (mode == TRANSLIT) {
        wc = '?';
        ++replacement_count;
        return Convert();
      }
      return false;
    }

    return Emit();
  }

  // Reads a 16-bit unit from `p` as little-endian, byte-swapping it if `swap` is set.
  // Also sets `src_bytes_used` to 2.
  uint16_t In16(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    uint16_t value = (src[0]) | (src[1] << 8);
    if (swap) value = __swap16(value);
    src_bytes_used = 2;
    return value;
  }

  // Reads a 32-bit unit from `p` as little-endian, byte-swapping it if `swap` is set.
  // Also sets `src_bytes_used` to 4.
  uint32_t In32(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    // Widen before shifting: a uint8_t promotes to (signed) int, and shifting a byte
    // >= 0x80 left by 24 would overflow it.
    uint32_t value = static_cast<uint32_t>(src[0]) |
                     (static_cast<uint32_t>(src[1]) << 8) |
                     (static_cast<uint32_t>(src[2]) << 16) |
                     (static_cast<uint32_t>(src[3]) << 24);
    if (swap) value = __swap32(value);
    src_bytes_used = 4;
    return value;
  }

  // Writes `ch` to `dst` as little-endian (after byte-swapping it if `swap` is set) and
  // sets `dst_bytes_used` to 2.
  void Out16(char* dst, char16_t ch, bool swap) {
    if (swap) ch = __swap16(ch);
    dst[0] = ch;
    dst[1] = ch >> 8;
    dst_bytes_used = 2;
  }

  // Writes `ch` to the start of `buf` as little-endian (after byte-swapping it if `swap`
  // is set) and sets `dst_bytes_used` to 4.
  void Out32(char32_t ch, bool swap) {
    if (swap) ch = __swap32(ch);
    buf[0] = ch;
    buf[1] = ch >> 8;
    buf[2] = ch >> 16;
    buf[3] = ch >> 24;
    dst_bytes_used = 4;
  }

  // Copies the encoded character to the caller's output buffer and advances both the
  // source and destination cursors. Fails with E2BIG, leaving both cursors untouched,
  // if the output buffer is too small.
  bool Emit() {
    if (dst_bytes_used > *dst_bytes_left) {
      errno = E2BIG;
      return false;
    }

    memcpy(*dst_buf, buf, dst_bytes_used);

    *src_buf += src_bytes_used;
    *src_bytes_left -= src_bytes_used;
    *dst_buf += dst_bytes_used;
    *dst_bytes_left -= dst_bytes_used;
    return true;
  }

  // Computes the result once all input has been consumed: the number of replacements in
  // TRANSLIT mode, -1/EILSEQ if anything was dropped in IGNORE mode, and 0 otherwise.
  int Done() {
    if (mode == TRANSLIT) return replacement_count;
    if (ignored) {
      errno = EILSEQ;
      return -1;
    }
    return 0;
  }
};

// Allocates a conversion descriptor from `__src_encoding` to `__dst_encoding`.
// Only the destination name may carry a "//IGNORE" or "//TRANSLIT" suffix.
// Returns INVALID_ICONV_T with errno set to EINVAL if either name cannot be parsed.
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
  iconv_t result = new __iconv_t;
  if (!__parse_encoding(__src_encoding, &result->src_encoding, NULL) ||
      !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
    delete result;
    errno = EINVAL;
    return INVALID_ICONV_T;
  }
  return result;
}

// Converts the input described by `__src_buf`/`__src_bytes_left` into the output described
// by `__dst_buf`/`__dst_bytes_left`, updating all four as data is consumed and produced.
// Returns (size_t)-1 with errno set on failure (EBADF for an invalid descriptor).
size_t iconv(iconv_t __converter,
             char** __src_buf, size_t* __src_bytes_left,
             char** __dst_buf, size_t* __dst_bytes_left) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return static_cast<size_t>(-1);
  }

  // A null input buffer is a request to reset the conversion state. Convert() already
  // resets all of its state on every call, so there is nothing to do here.
  if (__src_buf == NULL || *__src_buf == NULL) return 0;

  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
}

// Releases a descriptor returned by iconv_open().
// Returns -1 with errno set to EBADF for an invalid descriptor, 0 otherwise.
int iconv_close(iconv_t __converter) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }
  delete __converter;
  return 0;
}

#endif // __ANDROID__