CoCalc -- idna.c

GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmlibuv/src/idna.c
3153 views
1
/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
15

16
/* Derived from https://github.com/bnoordhuis/punycode
 * but updated to support IDNA 2008.
 */
19

20
#include "uv.h"
21
#include "idna.h"
22
#include <assert.h>
23
#include <string.h>
24
#include <limits.h> /* UINT_MAX */
25

26
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, already consumed by the caller; |*p| points at the
 * byte that follows it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point and leaves |*p| just past the sequence.
 * Returns (unsigned) -1 when the sequence is malformed: an invalid lead
 * byte, a sequence that runs past |pe|, a byte that is not of the form
 * 10xxxxxx in a continuation position, an overlong encoding, a value
 * above U+10FFFF, or a UTF-16 surrogate. On failure |*p| may have moved
 * past some bytes of the rejected sequence.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned min;
  int need;
  int i;

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  /* The lead byte alone determines how many continuation bytes follow,
   * the smallest code point that legitimately needs that many bytes, and
   * which of its low bits carry payload.
   */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Continuation byte in lead position. */
  }

  /* A truncated sequence is an error; it must not be reinterpreted as a
   * shorter sequence made of whatever bytes happen to be left.
   */
  if (pe - *p < need)
    return -1;

  for (i = 0; i < need; i++) {
    b = (unsigned char) **p;

    /* Every continuation byte is validated on its own. */
    if ((b & 0xC0) != 0x80)
      return -1;  /* Invalid continuation byte. */

    *p += 1;
    a = (a << 6) | (b & 63);
  }

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
91

92
/* Decode one UTF-8 encoded code point from the range [*p, pe).
 *
 * The range must not be empty. The lead byte is always consumed; bytes
 * below 128 are returned as-is, anything else is handed to
 * uv__utf8_decode1_slow() together with the advanced cursor, and whatever
 * that function returns is passed through unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* Plain ASCII needs no further decoding. */
}
104

105
static int uv__idna_toascii_label(const char* s, const char* se,
106
                                  char** d, char* de) {
107
  static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
108
  const char* ss;
109
  unsigned c;
110
  unsigned h;
111
  unsigned k;
112
  unsigned n;
113
  unsigned m;
114
  unsigned q;
115
  unsigned t;
116
  unsigned x;
117
  unsigned y;
118
  unsigned bias;
119
  unsigned delta;
120
  unsigned todo;
121
  int first;
122

123
  h = 0;
124
  ss = s;
125
  todo = 0;
126

127
  /* Note: after this loop we've visited all UTF-8 characters and know
128
   * they're legal so we no longer need to check for decode errors.
129
   */
130
  while (s < se) {
131
    c = uv__utf8_decode1(&s, se);
132

133
    if (c == UINT_MAX)
134
      return UV_EINVAL;
135

136
    if (c < 128)
137
      h++;
138
    else
139
      todo++;
140
  }
141

142
  /* Only write "xn--" when there are non-ASCII characters. */
143
  if (todo > 0) {
144
    if (*d < de) *(*d)++ = 'x';
145
    if (*d < de) *(*d)++ = 'n';
146
    if (*d < de) *(*d)++ = '-';
147
    if (*d < de) *(*d)++ = '-';
148
  }
149

150
  /* Write ASCII characters. */
151
  x = 0;
152
  s = ss;
153
  while (s < se) {
154
    c = uv__utf8_decode1(&s, se);
155
    assert(c != UINT_MAX);
156

157
    if (c > 127)
158
      continue;
159

160
    if (*d < de)
161
      *(*d)++ = c;
162

163
    if (++x == h)
164
      break;  /* Visited all ASCII characters. */
165
  }
166

167
  if (todo == 0)
168
    return h;
169

170
  /* Only write separator when we've written ASCII characters first. */
171
  if (h > 0)
172
    if (*d < de)
173
      *(*d)++ = '-';
174

175
  n = 128;
176
  bias = 72;
177
  delta = 0;
178
  first = 1;
179

180
  while (todo > 0) {
181
    m = -1;
182
    s = ss;
183

184
    while (s < se) {
185
      c = uv__utf8_decode1(&s, se);
186
      assert(c != UINT_MAX);
187

188
      if (c >= n)
189
        if (c < m)
190
          m = c;
191
    }
192

193
    x = m - n;
194
    y = h + 1;
195

196
    if (x > ~delta / y)
197
      return UV_E2BIG;  /* Overflow. */
198

199
    delta += x * y;
200
    n = m;
201

202
    s = ss;
203
    while (s < se) {
204
      c = uv__utf8_decode1(&s, se);
205
      assert(c != UINT_MAX);
206

207
      if (c < n)
208
        if (++delta == 0)
209
          return UV_E2BIG;  /* Overflow. */
210

211
      if (c != n)
212
        continue;
213

214
      for (k = 36, q = delta; /* empty */; k += 36) {
215
        t = 1;
216

217
        if (k > bias)
218
          t = k - bias;
219

220
        if (t > 26)
221
          t = 26;
222

223
        if (q < t)
224
          break;
225

226
        /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
227
         * 10 <= y <= 35, we can optimize the long division
228
         * into a table-based reciprocal multiplication.
229
         */
230
        x = q - t;
231
        y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
232
        q = x / y;
233
        t = t + x % y;  /* 1 <= t <= 35 because of y. */
234

235
        if (*d < de)
236
          *(*d)++ = alphabet[t];
237
      }
238

239
      if (*d < de)
240
        *(*d)++ = alphabet[q];
241

242
      delta /= 2;
243

244
      if (first) {
245
        delta /= 350;
246
        first = 0;
247
      }
248

249
      /* No overflow check is needed because |delta| was just
250
       * divided by 2 and |delta+delta >= delta + delta/h|.
251
       */
252
      h++;
253
      delta += delta / h;
254

255
      for (bias = 0; delta > 35 * 26 / 2; bias += 36)
256
        delta /= 35;
257

258
      bias += 36 * delta / (delta + 38);
259
      delta = 0;
260
      todo--;
261
    }
262

263
    delta++;
264
    n++;
265
  }
266

267
  return 0;
268
}
269

270
long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
271
  const char* si;
272
  const char* st;
273
  unsigned c;
274
  char* ds;
275
  int rc;
276

277
  if (s == se)
278
    return UV_EINVAL;
279

280
  ds = d;
281

282
  si = s;
283
  while (si < se) {
284
    st = si;
285
    c = uv__utf8_decode1(&si, se);
286

287
    if (c == UINT_MAX)
288
      return UV_EINVAL;
289

290
    if (c != '.')
291
      if (c != 0x3002)  /* 。 */
292
        if (c != 0xFF0E)  /* ． */
293
          if (c != 0xFF61)  /* ｡ */
294
            continue;
295

296
    rc = uv__idna_toascii_label(s, st, &d, de);
297

298
    if (rc < 0)
299
      return rc;
300

301
    if (d < de)
302
      *d++ = '.';
303

304
    s = si;
305
  }
306

307
  if (s < se) {
308
    rc = uv__idna_toascii_label(s, se, &d, de);
309

310
    if (rc < 0)
311
      return rc;
312
  }
313

314
  if (d >= de)
315
    return UV_EINVAL;
316

317
  *d++ = '\0';
318
  return d - ds;  /* Number of bytes written. */
319
}
320

321
Product

Resources

Company