/* Path: blob/main/crypto/krb5/src/util/support/t_utf8.c */
/* 34889 views */
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */1/* util/support/t_utf8.c - test UTF-8 boundary conditions */2/*3* Copyright (C) 2015 by the Massachusetts Institute of Technology.4* All rights reserved.5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9*10* * Redistributions of source code must retain the above copyright11* notice, this list of conditions and the following disclaimer.12*13* * Redistributions in binary form must reproduce the above copyright14* notice, this list of conditions and the following disclaimer in15* the documentation and/or other materials provided with the16* distribution.17*18* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS19* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT20* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS21* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE22* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,23* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES24* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR25* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)26* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,27* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)28* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED29* OF THE POSSIBILITY OF SUCH DAMAGE.30*/3132#include <stdio.h>33#include <string.h>3435#include "k5-platform.h"36#include "k5-utf8.h"3738/*39* Convenience macro to allow testing of old encodings.40*41* "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point42* was U+7FFFFFFF instead of U+10FFFF.43*/44#ifdef OLDENCODINGS45#define L(x) (x)46#else47#define L(x) 048#endif4950/*51* len is 0 for invalid encoding prefixes (KRB5_UTF8_CHARLEN2() partially52* enforces the validity of the first two 
bytes, based on masking the second53* byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the54* range between U+110000 and U+13FFFF).55*56* ucs is 0 for invalid encodings (including ones with valid prefixes according57* to KRB5_UTF8_CHARLEN2(); krb5int_utf8_to_ucs4() will still fail on them58* because it checks more things.) Code points above U+10FFFF are excluded by59* the actual test code and remain in the table for possibly testing the old60* implementation that didn't exclude them.61*62* Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the63* surrogate pair range.64*/65struct testcase {66const char *p;67krb5_ucs4 ucs;68int len;69} testcases[] = {70{ "\x7f", 0x0000007f, 1 }, /* Lowest 1-byte encoding */71{ "\xc0\x80", 0x00000000, 0 }, /* Invalid 2-byte encoding */72{ "\xc2\x80", 0x00000080, 2 }, /* Lowest valid 2-byte encoding */73{ "\xdf\xbf", 0x000007ff, 2 }, /* Highest valid 2-byte encoding*/74{ "\xdf\xff", 0x00000000, 2 }, /* Invalid 2-byte encoding*/75{ "\xe0\x80\x80", 0x00000000, 0 }, /* Invalid 3-byte encoding */76{ "\xe0\xa0\x80", 0x00000800, 3 }, /* Lowest valid 3-byte encoding */77{ "\xef\xbf\xbf", 0x0000ffff, 3 }, /* Highest valid 3-byte encoding */78{ "\xef\xff\xff", 0x00000000, 3 }, /* Invalid 3-byte encoding */79{ "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */80{ "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */81{ "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */82/* Next higher 4-byte encoding (old) */83{ "\xf4\x90\x80\x80", 0x00110000, 4 },84/* Highest 4-byte encoding starting with 0xf4 (old) */85{ "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 },86/* Next higher 4-byte prefix byte (old) */87{ "\xf5\x80\x80\x80", 0x00140000, L(4) },88/* Highest valid 4-byte encoding (old) */89{ "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) },90/* Invalid 4-byte encoding */91{ "\xf7\xff\xff\xff", 0x00000000, L(4) },92/* Invalid 5-byte encoding */93{ "\xf8\x80\x80\x80\x80", 0x00000000, 
0 },94/* Lowest valid 5-byte encoding (old) */95{ "\xf8\x88\x80\x80\x80", 0x00200000, L(5) },96/* Highest valid 5-byte encoding (old) */97{ "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) },98/* Invalid 5-byte encoding */99{ "\xfb\xff\xff\xff\xff", 0x00000000, L(5) },100/* Invalid 6-byte encoding */101{ "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 },102/* Lowest valid 6-byte encoding (old) */103{ "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) },104/* Highest valid 6-byte encoding (old) */105{ "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) },106/* Invalid 6-byte encoding */107{ "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) },108};109110static void111printhex(const char *p)112{113for (; *p != '\0'; p++) {114printf("%02x ", (unsigned char)*p);115}116}117118static void119printtest(struct testcase *t)120{121printhex(t->p);122printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len);123}124125static int126test_decode(struct testcase *t, int high4)127{128int len, status = 0;129krb5_ucs4 u = 0;130131len = KRB5_UTF8_CHARLEN2(t->p, len);132if (len != t->len) {133printf("expected len=%d, got len=%d\n", t->len, len);134status = 1;135}136if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) {137printf("unexpected success in utf8_to_ucs4\n");138status = 1;139}140if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) {141printf("unexpected failure in utf8_to_ucs4\n");142status = 1;143}144if (t->ucs != u && !high4) {145printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs,146(unsigned long)u);147status = 1;148}149return status;150}151152static int153test_encode(struct testcase *t, int high4)154{155size_t size;156char buf[7];157158memset(buf, 0, sizeof(buf));159size = krb5int_ucs4_to_utf8(t->ucs, buf);160if (high4 && size != 0) {161printf("unexpected success beyond U+10FFFF\n");162return 1;163}164if (!high4 && size == 0) {165printf("unexpected zero size on encode\n");166return 1;167}168if (size != 0 && strcmp(t->p, buf) != 0) {169printf("expected 
");170printhex(t->p);171printf("got ");172printhex(buf);173printf("\n");174return 1;175}176return 0;177}178179int180main(int argc, char **argv)181{182size_t ncases = sizeof(testcases) / sizeof(testcases[0]);183size_t i;184struct testcase *t;185int status = 0, verbose = 0;186/* Is this a "high" 4-byte encoding above U+10FFFF? */187int high4;188189if (argc == 2 && strcmp(argv[1], "-v") == 0)190verbose = 1;191for (i = 0; i < ncases; i++) {192t = &testcases[i];193if (verbose)194printtest(t);195#ifndef OLDENCODINGS196high4 = t->ucs > 0x10ffff;197#else198high4 = 0;199#endif200if (test_decode(t, high4) != 0)201status = 1;202if (t->ucs == 0)203continue;204if (test_encode(t, high4) != 0)205status = 1;206}207return status;208}209210211