Path: blob/main/crypto/krb5/src/util/support/utf8.c
34889 views
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/utf8.c */
/*
 * Copyright 2008 by the Massachusetts Institute of Technology.
 * All Rights Reserved.
 *
 * Export of this software from the United States of America may
 * require a specific license from the United States Government.
 * It is the responsibility of any person or organization contemplating
 * export to obtain such a license before exporting.
 *
 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
 * distribute this software and its documentation for any purpose and
 * without fee is hereby granted, provided that the above copyright
 * notice appear in all copies and that both that copyright notice and
 * this permission notice appear in supporting documentation, and that
 * the name of M.I.T. not be used in advertising or publicity pertaining
 * to distribution of the software without specific, written prior
 * permission. Furthermore if you modify this software you must label
 * your software as modified software and not distribute it in such a
 * fashion that it might be confused with the original M.I.T. software.
 * M.I.T. makes no representations about the suitability of
 * this software for any purpose. It is provided "as is" without express
 * or implied warranty.
 */
/*
 * Copyright 1998-2008 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <https://www.OpenLDAP.org/license.html>.
 */

/* This work is part of OpenLDAP Software <https://www.openldap.org/>. */

/* Basic UTF-8 routines
 *
 * These routines are "dumb". Though they understand UTF-8,
 * they don't grok Unicode. That is, they can push bits,
 * but don't have a clue what the bits represent.
That's45* good enough for use with the KRB5 Client SDK.46*47* These routines are not optimized.48*/4950#include "k5-platform.h"51#include "k5-utf8.h"52#include "supp-int.h"5354/*55* Returns length indicated by first byte.56*/57const char krb5int_utf8_lentab[] = {580, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,590, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,610, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,620, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,632, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,643, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,654, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };6667/*68* Make sure the UTF-8 char used the shortest possible encoding69* returns charlen if valid, 0 if not.70*71* Here are the valid UTF-8 encodings, taken from RFC 3629 page 4.72* The table is slightly modified from that of the RFC.73*74* UCS-4 range (hex) UTF-8 sequence (binary)75* 0000 0000-0000 007F 0.......76* 0000 0080-0000 07FF 110++++. 10......77* 0000 0800-0000 FFFF 1110++++ 10+..... 10......78* 0001 0000-0010 FFFF 11110+++ 10++.... 10...... 10......79*80* The '.' bits are "don't cares". When validating a UTF-8 sequence,81* at least one of the '+' bits must be set, otherwise the character82* should have been encoded in fewer octets. Note that in the two-octet83* case, only the first octet needs to be validated, and this is done84* in the krb5int_utf8_lentab[] above.85*/8687/* mask of required bits in second octet */88#undef c89#define c const char90c krb5int_utf8_mintab[] = {91(c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,92(c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,93(c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x00, (c)0x00, (c)0x00,94(c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00 };95#undef c9697/*98* Convert a UTF8 character to a UCS4 character. 
Return 0 on success,99* -1 on failure.100*/101int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)102{103const unsigned char *c = (const unsigned char *) p;104krb5_ucs4 ch;105int len, i;106static unsigned char mask[] = {1070, 0x7f, 0x1f, 0x0f, 0x07 };108109*out = 0;110len = KRB5_UTF8_CHARLEN2(p, len);111112if (len == 0)113return -1;114115ch = c[0] & mask[len];116117for (i = 1; i < len; i++) {118if ((c[i] & 0xc0) != 0x80)119return -1;120121ch <<= 6;122ch |= c[i] & 0x3f;123}124125if (ch > 0x10ffff)126return -1;127128*out = ch;129return 0;130}131132/* conv UCS-4 to UTF-8 */133size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)134{135size_t len = 0;136unsigned char *p = (unsigned char *) buf;137138/* not a valid Unicode character */139if (c > 0x10ffff)140return 0;141142/* Just return length, don't convert */143if (buf == NULL) {144if (c < 0x80) return 1;145else if (c < 0x800) return 2;146else if (c < 0x10000) return 3;147else return 4;148}149150if (c < 0x80) {151p[len++] = c;152} else if (c < 0x800) {153p[len++] = 0xc0 | ( c >> 6 );154p[len++] = 0x80 | ( c & 0x3f );155} else if (c < 0x10000) {156p[len++] = 0xe0 | ( c >> 12 );157p[len++] = 0x80 | ( (c >> 6) & 0x3f );158p[len++] = 0x80 | ( c & 0x3f );159} else /* if (c < 0x110000) */ {160p[len++] = 0xf0 | ( c >> 18 );161p[len++] = 0x80 | ( (c >> 12) & 0x3f );162p[len++] = 0x80 | ( (c >> 6) & 0x3f );163p[len++] = 0x80 | ( c & 0x3f );164}165166return len;167}168169170