#ifdef TLF_FONTS
/*
 * Copyright (c) 2007 Alexey Vatchenko <[email protected]>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/types.h>

#include <stdint.h>
#include <wchar.h>
#include <arpa/inet.h>	/* no longer used here; kept so the include set is unchanged */

#include "utf8.h"

/* Marker bits of a continuation byte and of the 2..6 byte lead bytes. */
#define _NXT	0x80
#define _SEQ2	0xc0
#define _SEQ3	0xe0
#define _SEQ4	0xf0
#define _SEQ5	0xf8
#define _SEQ6	0xfc

#define _BOM	0xfeff

static int __wchar_forbitten(wchar_t sym);
static int __utf8_forbitten(u_char octet);

/*
 * Returns -1 if `sym' lies in the surrogate range 0xd800..0xdfff,
 * 0 otherwise.
 */
static int
__wchar_forbitten(wchar_t sym)
{

	/* Surrogate pairs */
	if (sym >= 0xd800 && sym <= 0xdfff)
		return (-1);

	return (0);
}

/*
 * Returns -1 if `octet' is one of the rejected lead bytes
 * (0xc0, 0xc1, 0xf5, 0xff), 0 otherwise.
 */
static int
__utf8_forbitten(u_char octet)
{

	switch (octet) {
	case 0xc0:
	case 0xc1:
	case 0xf5:
	case 0xff:
		return (-1);
	}

	return (0);
}

/*
 * DESCRIPTION
 *	This function translates UTF-8 string into UCS-4 string (all symbols
 *	will be in local machine byte order).
 *
 *	It takes the following arguments:
 *	in	- input UTF-8 string. It can be null-terminated.
 *	insize	- size of input string in bytes.
 *	out	- result buffer for UCS-4 string. If out is NULL,
 *		function returns size of result buffer.
 *	outsize	- size of out buffer in wide characters.
 *	flags	- bitwise OR of UTF8_IGNORE_ERROR and UTF8_SKIP_BOM.
 *
 * RETURN VALUES
 *	The function returns size of result buffer (in wide characters).
 *	Zero is returned in case of error (NULL or empty input, a non-NULL
 *	`out' with zero `outsize', malformed input without
 *	UTF8_IGNORE_ERROR, or not enough room in `out').
 *
 * CAVEATS
 *	1. When `out' is not NULL, conversion stops after the first decoded
 *	   zero character; that character is stored and counted. When `out'
 *	   is NULL, zero bytes are counted like any other character.
 *	2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
 *	   when `out' is NULL and not NULL. It's because of special UTF-8
 *	   sequences which may result in forbidden (by RFC3629) UNICODE
 *	   characters: they are only detected (and dropped) while actually
 *	   decoding into `out'. So, the caller must check return value every
 *	   time and not prepare buffer in advance (\0 terminate) but after
 *	   calling this function.
 */
size_t
utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize,
    int flags)
{
	const u_char *p, *lim;
	wchar_t *wlim;
	uint32_t cp, high;
	size_t n, total, i, n_bits;

	if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
		return (0);

	total = 0;
	p = (const u_char *)in;
	lim = p + insize;
	/* Only form the end pointer when there is a real buffer. */
	wlim = (out != NULL) ? out + outsize : NULL;

	for (; p < lim; p += n) {
		if (__utf8_forbitten(*p) != 0 &&
		    (flags & UTF8_IGNORE_ERROR) == 0)
			return (0);

		/*
		 * Get number of bytes for one wide character and the
		 * payload bits carried by the lead byte.
		 */
		n = 1;	/* default: 1 byte. Used when skipping bytes. */
		if ((*p & 0x80) == 0)
			high = *p;
		else if ((*p & 0xe0) == _SEQ2) {
			n = 2;
			high = *p & 0x1f;
		} else if ((*p & 0xf0) == _SEQ3) {
			n = 3;
			high = *p & 0x0f;
		} else if ((*p & 0xf8) == _SEQ4) {
			n = 4;
			high = *p & 0x07;
		} else if ((*p & 0xfc) == _SEQ5) {
			n = 5;
			high = *p & 0x03;
		} else if ((*p & 0xfe) == _SEQ6) {
			n = 6;
			high = *p & 0x01;
		} else {
			/* Stray continuation byte or 0xfe/0xff. */
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);
			continue;
		}

		/* does the sequence header tell us truth about length? */
		if ((size_t)(lim - p) < n) {
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);
			n = 1;
			continue;	/* skip */
		}

		/*
		 * Validate sequence.
		 * All trailing bytes must have higher bits set to 10xxxxxx
		 */
		for (i = 1; i < n; i++) {
			if ((p[i] & 0xc0) != _NXT)
				break;
		}
		if (i != n) {
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);
			n = 1;
			continue;	/* skip */
		}

		total++;

		/* Size query only: nothing is decoded. */
		if (out == NULL)
			continue;

		if (out >= wlim)
			return (0);	/* no space left */

		/* Assemble the code point, last byte first. */
		cp = 0;
		n_bits = 0;
		for (i = 1; i < n; i++) {
			cp |= (uint32_t)(p[n - i] & 0x3f) << n_bits;
			n_bits += 6;	/* 6 low bits in every byte */
		}
		cp |= high << n_bits;

		if (cp == 0) {		/* return at end of string */
			*out = 0;
			break;
		}

		/*
		 * Dropped characters are un-counted and `out' is simply not
		 * advanced, so no pointer before the buffer start is formed.
		 */
		if (__wchar_forbitten((wchar_t)cp) != 0) {
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);	/* forbidden character */
			total--;
			continue;
		}
		if (cp == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
			total--;
			continue;
		}

		*out++ = (wchar_t)cp;
	}

	return (total);
}

/*
 * DESCRIPTION
 *	This function translates UCS-4 symbols (given in local machine
 *	byte order) into UTF-8 string.
 *
 *	It takes the following arguments:
 *	in	- input unicode string. It can be null-terminated.
 *	insize	- size of input string in wide characters.
 *	out	- result buffer for utf8 string. If out is NULL,
 *		function returns size of result buffer.
 *	outsize	- size of result buffer.
 *	flags	- bitwise OR of UTF8_IGNORE_ERROR and UTF8_SKIP_BOM.
 *
 * RETURN VALUES
 *	The function returns size of result buffer (in bytes). Zero is returned
 *	in case of error (NULL or empty input, a non-NULL `out' with zero
 *	`outsize', a surrogate or out-of-range value without
 *	UTF8_IGNORE_ERROR, or not enough room in `out').
 *
 * CAVEATS
 *	If UCS-4 string contains zero symbols, they will be translated
 *	as regular symbols.
 */
size_t
wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize,
    int flags)
{
	/* Lead-byte marker indexed by sequence length (index 0 unused). */
	static const u_char lead[] = {
		0x00, 0x00, _SEQ2, _SEQ3, _SEQ4, _SEQ5, _SEQ6
	};
	const wchar_t *w, *wlim;
	u_char *p, *lim;
	uint32_t cp;
	size_t total, n, i;

	if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
		return (0);

	wlim = in + insize;
	p = (u_char *)out;
	/* Only form the end pointer when there is a real buffer. */
	lim = (out != NULL) ? p + outsize : NULL;
	total = 0;
	for (w = in; w < wlim; w++) {
		if (__wchar_forbitten(*w) != 0) {
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);
			continue;
		}

		if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
			continue;

		/*
		 * Work on an unsigned copy: a negative wchar_t (or, where
		 * wchar_t is unsigned, a value with bit 31 set) ends up
		 * above 0x7fffffff and is rejected.
		 */
		cp = (uint32_t)*w;
		if (cp > 0x7fffffffU) {
			if ((flags & UTF8_IGNORE_ERROR) == 0)
				return (0);
			continue;
		} else if (cp <= 0x0000007f)
			n = 1;
		else if (cp <= 0x000007ff)
			n = 2;
		else if (cp <= 0x0000ffff)
			n = 3;
		else if (cp <= 0x001fffff)
			n = 4;
		else if (cp <= 0x03ffffff)
			n = 5;
		else /* if (cp <= 0x7fffffff) */
			n = 6;

		total += n;

		/* Size query only: nothing is written. */
		if (out == NULL)
			continue;

		if ((size_t)(lim - p) < n)
			return (0);	/* no space left */

		/*
		 * Fill continuation bytes from the last one backwards, six
		 * bits at a time; what is left of `cp' goes into the lead
		 * byte. Pure shifts, so the result does not depend on host
		 * byte order or on sizeof(wchar_t).
		 */
		for (i = n - 1; i > 0; i--) {
			p[i] = (u_char)(_NXT | (cp & 0x3f));
			cp >>= 6;
		}
		p[0] = (u_char)(lead[n] | cp);

		/*
		 * NOTE: do not check here for forbidden UTF-8 characters.
		 * They cannot appear here because we do proper conversion.
		 */

		p += n;
	}

	return (total);
}
#endif /* TLF_FONTS */