Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/npt/utf.c
38769 views
/*1* Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/* Misc functions for conversion of Unicode and UTF-8 and platform encoding */2627#include <stdio.h>28#include <stddef.h>29#include <stdlib.h>30#include <stdarg.h>31#include <string.h>32#include <ctype.h>3334#include "jni.h"3536#include "utf.h"3738/*39* Error handler40*/41void42utfError(char *file, int line, char *message)43{44(void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);45abort();46}4748/*49* Convert UTF-8 to UTF-1650* Returns length or -1 if output overflows.51*/52int JNICALL53utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen)54{55int outputLen;56int i;5758UTF_ASSERT(utf8);59UTF_ASSERT(len>=0);60UTF_ASSERT(output);61UTF_ASSERT(outputMaxLen>0);6263i = 0;64outputLen = 0;65while ( i<len ) {66unsigned code, x, y, z;6768if ( outputLen >= outputMaxLen ) {69return -1;70}71x = (unsigned char)utf8[i++];72code = x;73if ( (x & 0xE0)==0xE0 ) {74y = (unsigned char)utf8[i++];75z = (unsigned char)utf8[i++];76code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F);77} else if ( (x & 0xC0)==0xC0 ) {78y = (unsigned char)utf8[i++];79code = ((x & 0x1F)<<6) + (y & 0x3F);80}81output[outputLen++] = code;82}83return outputLen;84}8586/*87* Convert UTF-16 to UTF-8 Modified88* Returns length or -1 if output overflows.89*/90int JNICALL91utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)92{93int i;94int outputLen;9596UTF_ASSERT(utf16);97UTF_ASSERT(len>=0);98UTF_ASSERT(output);99UTF_ASSERT(outputMaxLen>0);100101outputLen = 0;102for (i = 0; i < len; i++) {103unsigned code;104105code = utf16[i];106if ( code >= 0x0001 && code <= 0x007F ) {107if ( outputLen + 1 >= outputMaxLen ) {108return -1;109}110output[outputLen++] = code;111} else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) {112if ( outputLen + 2 >= outputMaxLen ) {113return -1;114}115output[outputLen++] = ((code>>6) & 0x1F) | 0xC0;116output[outputLen++] = (code & 0x3F) | 0x80;117} else if ( code >= 0x0800 && code <= 0xFFFF ) {118if ( outputLen + 3 >= outputMaxLen ) {119return -1;120}121output[outputLen++] = ((code>>12) & 0x0F) | 0xE0;122output[outputLen++] = ((code>>6) & 0x3F) | 0x80;123output[outputLen++] = (code & 0x3F) | 0x80;124}125}126output[outputLen] = 0;127return outputLen;128}129130int JNICALL131utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)132{133return -1; /* FIXUP */134}135136/* Determine length of this Standard UTF-8 in Modified UTF-8.137* Validation is done of the basic UTF encoding rules, returns138* length (no change) when errors are detected in the UTF encoding.139*140* Note: Accepts Modified UTF-8 also, no verification on the141* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.142*/143int JNICALL144utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length)145{146int newLength;147int i;148149newLength = 0;150for ( i = 0 ; i < length ; i++ ) {151unsigned byte;152153byte = (unsigned char)string[i];154if ( (byte & 0x80) == 0 ) { /* 1byte encoding */155newLength++;156if ( byte == 0 ) {157newLength++; /* We gain one byte in length on NULL bytes */158}159} else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */160/* Check encoding of following bytes */161if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {162break; /* Error condition */163}164i++; /* Skip next byte */165newLength += 2;166} else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */167/* Check encoding of following bytes */168if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80169|| (string[i+2] & 0xC0) != 0x80 ) {170break; /* Error condition */171}172i += 2; /* Skip next two bytes */173newLength += 3;174} else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */175/* Check encoding of following bytes */176if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80177|| (string[i+2] & 0xC0) != 0x80178|| (string[i+3] & 0xC0) != 0x80 ) {179break; /* Error condition */180}181i += 3; /* Skip next 3 bytes */182newLength += 6; /* 4byte encoding turns into 2 3byte ones */183} else {184break; /* Error condition */185}186}187if ( i != length ) {188/* Error in finding new length, return old length so no conversion */189/* FIXUP: ERROR_MESSAGE? */190return length;191}192return newLength;193}194195/* Convert Standard UTF-8 to Modified UTF-8.196* Assumes the UTF-8 encoding was validated by utf8mLength() above.197*198* Note: Accepts Modified UTF-8 also, no verification on the199* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.200*/201void JNICALL202utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)203{204int i;205int j;206207j = 0;208for ( i = 0 ; i < length ; i++ ) {209unsigned byte1;210211byte1 = (unsigned char)string[i];212213/* NULL bytes and bytes starting with 11110xxx are special */214if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */215if ( byte1 == 0 ) {216/* Bits out: 11000000 10000000 */217newString[j++] = (jbyte)0xC0;218newString[j++] = (jbyte)0x80;219} else {220/* Single byte */221newString[j++] = byte1;222}223} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */224newString[j++] = byte1;225newString[j++] = string[++i];226} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */227newString[j++] = byte1;228newString[j++] = string[++i];229newString[j++] = string[++i];230} else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */231/* Beginning of 4byte encoding, turn into 2 3byte encodings */232unsigned byte2, byte3, byte4, u21;233234/* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */235byte2 = (unsigned char)string[++i];236byte3 = (unsigned char)string[++i];237byte4 = (unsigned char)string[++i];238/* Reconstruct full 21bit value */239u21 = (byte1 & 0x07) << 18;240u21 += (byte2 & 0x3F) << 12;241u21 += (byte3 & 0x3F) << 6;242u21 += (byte4 & 0x3F);243/* Bits out: 11101101 1010xxxx 10xxxxxx */244newString[j++] = (jbyte)0xED;245newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));246newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));247/* Bits out: 11101101 1011xxxx 10xxxxxx */248newString[j++] = (jbyte)0xED;249newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F));250newString[j++] = byte4;251}252}253UTF_ASSERT(i==length);254UTF_ASSERT(j==newLength);255newString[j] = (jbyte)0;256}257258/* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.259* Basic validation of the UTF encoding rules is done, and length is260* returned (no change) when errors are detected.261*262* Note: No validation is made that this is indeed Modified UTF-8 coming in.263*264*/265int JNICALL266utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length)267{268int newLength;269int i;270271newLength = 0;272for ( i = 0 ; i < length ; i++ ) {273unsigned byte1, byte2, byte3, byte4, byte5, byte6;274275byte1 = (unsigned char)string[i];276if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */277newLength++;278} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */279/* Check encoding of following bytes */280if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {281break; /* Error condition */282}283byte2 = (unsigned char)string[++i];284if ( byte1 != 0xC0 || byte2 != 0x80 ) {285newLength += 2; /* Normal 2byte encoding, not 0xC080 */286} else {287newLength++; /* We will turn 0xC080 into 0 */288}289} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */290/* Check encoding of following bytes */291if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80292|| (string[i+2] & 0xC0) != 0x80 ) {293break; /* Error condition */294}295byte2 = (unsigned char)string[++i];296byte3 = (unsigned char)string[++i];297newLength += 3;298/* Possible process a second 3byte encoding */299if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {300/* See if this is a pair of 3byte encodings */301byte4 = (unsigned char)string[i+1];302byte5 = (unsigned char)string[i+2];303byte6 = (unsigned char)string[i+3];304if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {305/* Check encoding of 3rd byte */306if ( (byte6 & 0xC0) != 0x80 ) {307break; /* Error condition */308}309newLength++; /* New string will have 4byte encoding */310i += 3; /* Skip next 3 bytes */311}312}313} else {314break; /* Error condition */315}316}317if ( i != length ) {318/* Error in UTF encoding */319/* FIXUP: ERROR_MESSAGE()? */320return length;321}322return newLength;323}324325/* Convert a Modified UTF-8 string into a Standard UTF-8 string326* It is assumed that this string has been validated in terms of the327* basic UTF encoding rules by utf8Length() above.328*329* Note: No validation is made that this is indeed Modified UTF-8 coming in.330*331*/332void JNICALL333utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)334{335int i;336int j;337338j = 0;339for ( i = 0 ; i < length ; i++ ) {340unsigned byte1, byte2, byte3, byte4, byte5, byte6;341342byte1 = (unsigned char)string[i];343if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */344/* Single byte */345newString[j++] = byte1;346} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */347byte2 = (unsigned char)string[++i];348if ( byte1 != 0xC0 || byte2 != 0x80 ) {349newString[j++] = byte1;350newString[j++] = byte2;351} else {352newString[j++] = 0;353}354} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */355byte2 = (unsigned char)string[++i];356byte3 = (unsigned char)string[++i];357if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {358/* See if this is a pair of 3byte encodings */359byte4 = (unsigned char)string[i+1];360byte5 = (unsigned char)string[i+2];361byte6 = (unsigned char)string[i+3];362if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {363unsigned u21;364365/* Bits in: 11101101 1010xxxx 10xxxxxx */366/* Bits in: 11101101 1011xxxx 10xxxxxx */367i += 3;368369/* Reconstruct 21 bit code */370u21 = ((byte2 & 0x0F) + 1) << 16;371u21 += (byte3 & 0x3F) << 10;372u21 += (byte5 & 0x0F) << 6;373u21 += (byte6 & 0x3F);374375/* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */376377/* Convert to 4byte encoding */378newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);379newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);380newString[j++] = 0x80 + ((u21 >> 6) & 0x3F);381newString[j++] = 0x80 + (u21 & 0x3F);382continue;383}384}385/* Normal 3byte encoding */386newString[j++] = byte1;387newString[j++] = byte2;388newString[j++] = byte3;389}390}391UTF_ASSERT(i==length);392UTF_ASSERT(j==newLength);393newString[j] = 0;394}395396/* ================================================================= */397398#ifdef COMPILE_WITH_UTF_TEST /* Test program */399400/*401* Convert any byte array into a printable string.402* Returns length or -1 if output overflows.403*/404static int405bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)406{407int outputLen;408int i;409410UTF_ASSERT(bytes);411UTF_ASSERT(len>=0);412UTF_ASSERT(output);413UTF_ASSERT(outputMaxLen>=0);414415outputLen = 0;416for ( i=0; i<len ; i++ ) {417unsigned byte;418419byte = bytes[i];420if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) {421if ( outputLen + 1 >= outputMaxLen ) {422return -1;423}424output[outputLen++] = (char)byte;425} else {426if ( outputLen + 4 >= outputMaxLen ) {427return -1;428}429(void)sprintf(output+outputLen,"\\x%02x",byte);430outputLen += 4;431}432}433output[outputLen] = 0;434return outputLen;435}436437static void438test(void)439{440static char *strings[] = {441"characters",442"abcdefghijklmnopqrstuvwxyz",443"0123456789",444"!@#$%^&*()_+=-{}[]:;",445NULL };446int i;447struct UtfInst *ui;448449ui = utfInitialize(NULL);450451i = 0;452while ( strings[i] != NULL ) {453char *str;454#define MAX 1024455char buf0[MAX];456char buf1[MAX];457char buf2[MAX];458unsigned short buf3[MAX];459int len1;460int len2;461int len3;462463str = strings[i];464465(void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024);466467len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024);468469UTF_ASSERT(len1==(int)strlen(str));470471len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024);472473UTF_ASSERT(len3==len1);474475len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024);476477UTF_ASSERT(len1==len3);478UTF_ASSERT(strcmp(str, buf1) == 0);479480len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024);481482UTF_ASSERT(len2==len1);483UTF_ASSERT(strcmp(str, buf2) == 0);484485i++;486}487488utfTerminate(ui, NULL);489490}491492int493main(int argc, char **argv)494{495test();496return 0;497}498499#endif500501502