CoCalc -- Punycode.java

GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/net/idn/Punycode.java
⁴⁶⁹⁶³ views
1
/*
2
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3
 *
4
 * This code is free software; you can redistribute it and/or modify it
5
 * under the terms of the GNU General Public License version 2 only, as
6
 * published by the Free Software Foundation.  Oracle designates this
7
 * particular file as subject to the "Classpath" exception as provided
8
 * by Oracle in the LICENSE file that accompanied this code.
9
 *
10
 * This code is distributed in the hope that it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
 * version 2 for more details (a copy is included in the LICENSE file that
14
 * accompanied this code).
15
 *
16
 * You should have received a copy of the GNU General Public License version
17
 * 2 along with this work; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
 *
20
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
 * or visit www.oracle.com if you need additional information or have any
22
 * questions.
23
 */
24
/*
25
 *******************************************************************************
26
 * Copyright (C) 2003-2004, International Business Machines Corporation and    *
27
 * others. All Rights Reserved.                                                *
28
 *******************************************************************************
29
 */
30
//
31
// CHANGELOG
32
//      2005-05-19 Edward Wang
33
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
34
//          - move from package com.ibm.icu.text to package sun.net.idn
35
//          - use ParseException instead of StringPrepParseException
36
//      2007-08-14 Martin Buchholz
37
//          - remove redundant casts
38
//
39
package sun.net.idn;
40

41
import java.text.ParseException;
42
import sun.text.normalizer.UCharacter;
43
import sun.text.normalizer.UTF16;
44

45
/**
46
 * Ported code from ICU punycode.c
47
 * @author ram
48
 */
49

50
/* Package Private class */
51
public final class Punycode {
52

53
    /* Punycode parameters for Bootstring */
54
    private static final int BASE           = 36;
55
    private static final int TMIN           = 1;
56
    private static final int TMAX           = 26;
57
    private static final int SKEW           = 38;
58
    private static final int DAMP           = 700;
59
    private static final int INITIAL_BIAS   = 72;
60
    private static final int INITIAL_N      = 0x80;
61

62
    /* "Basic" Unicode/ASCII code points */
63
    private static final int HYPHEN         = 0x2d;
64
    private static final int DELIMITER      = HYPHEN;
65

66
    private static final int ZERO           = 0x30;
67
    private static final int NINE           = 0x39;
68

69
    private static final int SMALL_A        = 0x61;
70
    private static final int SMALL_Z        = 0x7a;
71

72
    private static final int CAPITAL_A      = 0x41;
73
    private static final int CAPITAL_Z      = 0x5a;
74

75
    //  TODO: eliminate the 256 limitation
76
    private static final int MAX_CP_COUNT   = 256;
77

78
    private static final int UINT_MAGIC     = 0x80000000;
79
    private static final long ULONG_MAGIC   = 0x8000000000000000L;
80

81
    private static int adaptBias(int delta, int length, boolean firstTime){
82
        if(firstTime){
83
            delta /=DAMP;
84
        }else{
85
            delta /=  2;
86
        }
87
        delta += delta/length;
88

89
        int count=0;
90
        for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
91
            delta/=(BASE-TMIN);
92
        }
93

94
        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
95
    }
96

97
    /**
98
     * basicToDigit[] contains the numeric value of a basic code
99
     * point (for use in representing integers) in the range 0 to
100
     * BASE-1, or -1 if b is does not represent a value.
101
     */
102
    static final int[]    basicToDigit= new int[]{
103
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
104
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
105

106
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
107
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
108

109
        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
110
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
111

112
        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
113
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
114

115
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
116
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117

118
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
119
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120

121
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
122
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
123

124
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
126
    };
127

128
    private static char asciiCaseMap(char b, boolean uppercase) {
129
        if(uppercase) {
130
            if(SMALL_A<=b && b<=SMALL_Z) {
131
                b-=(SMALL_A-CAPITAL_A);
132
            }
133
        } else {
134
            if(CAPITAL_A<=b && b<=CAPITAL_Z) {
135
                b+=(SMALL_A-CAPITAL_A);
136
            }
137
        }
138
        return b;
139
    }
140

141
    /**
142
     * digitToBasic() returns the basic code point whose value
143
     * (when used for representing integers) is d, which must be in the
144
     * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
145
     * nonzero, in which case the uppercase form is used.
146
     */
147
    private static char digitToBasic(int digit, boolean uppercase) {
148
        /*  0..25 map to ASCII a..z or A..Z */
149
        /* 26..35 map to ASCII 0..9         */
150
        if(digit<26) {
151
            if(uppercase) {
152
                return (char)(CAPITAL_A+digit);
153
            } else {
154
                return (char)(SMALL_A+digit);
155
            }
156
        } else {
157
            return (char)((ZERO-26)+digit);
158
        }
159
    }
160
    /**
161
     * Converts Unicode to Punycode.
162
     * The input string must not contain single, unpaired surrogates.
163
     * The output will be represented as an array of ASCII code points.
164
     *
165
     * @param src
166
     * @param caseFlags
167
     * @return
168
     * @throws ParseException
169
     */
170
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
171

172
        int[] cpBuffer = new int[MAX_CP_COUNT];
173
        int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
174
        char c, c2;
175
        int srcLength = src.length();
176
        int destCapacity = MAX_CP_COUNT;
177
        char[] dest = new char[destCapacity];
178
        StringBuffer result = new StringBuffer();
179
        /*
180
         * Handle the basic code points and
181
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
182
         */
183
        srcCPCount=destLength=0;
184

185
        for(j=0; j<srcLength; ++j) {
186
            if(srcCPCount==MAX_CP_COUNT) {
187
                /* too many input code points */
188
                throw new ParseException("Too many input code points", -1);
189
            }
190
            c=src.charAt(j);
191
            if(isBasic(c)) {
192
                if(destLength<destCapacity) {
193
                    cpBuffer[srcCPCount++]=0;
194
                    dest[destLength]=
195
                        caseFlags!=null ?
196
                            asciiCaseMap(c, caseFlags[j]) :
197
                            c;
198
                }
199
                ++destLength;
200
            } else {
201
                n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
202
                if(!UTF16.isSurrogate(c)) {
203
                    n|=c;
204
                } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
205
                    ++j;
206

207
                    n|=UCharacter.getCodePoint(c, c2);
208
                } else {
209
                    /* error: unmatched surrogate */
210
                    throw new ParseException("Illegal char found", -1);
211
                }
212
                cpBuffer[srcCPCount++]=n;
213
            }
214
        }
215

216
        /* Finish the basic string - if it is not empty - with a delimiter. */
217
        basicLength=destLength;
218
        if(basicLength>0) {
219
            if(destLength<destCapacity) {
220
                dest[destLength]=DELIMITER;
221
            }
222
            ++destLength;
223
        }
224

225
        /*
226
         * handledCPCount is the number of code points that have been handled
227
         * basicLength is the number of basic code points
228
         * destLength is the number of chars that have been output
229
         */
230

231
        /* Initialize the state: */
232
        n=INITIAL_N;
233
        delta=0;
234
        bias=INITIAL_BIAS;
235

236
        /* Main encoding loop: */
237
        for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
238
            /*
239
             * All non-basic code points < n have been handled already.
240
             * Find the next larger one:
241
             */
242
            for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
243
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
244
                if(n<=q && q<m) {
245
                    m=q;
246
                }
247
            }
248

249
            /*
250
             * Increase delta enough to advance the decoder's
251
             * <n,i> state to <m,0>, but guard against overflow:
252
             */
253
            if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
254
                throw new RuntimeException("Internal program error");
255
            }
256
            delta+=(m-n)*(handledCPCount+1);
257
            n=m;
258

259
            /* Encode a sequence of same code points n */
260
            for(j=0; j<srcCPCount; ++j) {
261
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
262
                if(q<n) {
263
                    ++delta;
264
                } else if(q==n) {
265
                    /* Represent delta as a generalized variable-length integer: */
266
                    for(q=delta, k=BASE; /* no condition */; k+=BASE) {
267

268
                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
269

270
                        t=k-bias;
271
                        if(t<TMIN) {
272
                            t=TMIN;
273
                        } else if(t>TMAX) {
274
                            t=TMAX;
275
                        }
276
                        */
277

278
                        t=k-bias;
279
                        if(t<TMIN) {
280
                            t=TMIN;
281
                        } else if(k>=(bias+TMAX)) {
282
                            t=TMAX;
283
                        }
284

285
                        if(q<t) {
286
                            break;
287
                        }
288

289
                        if(destLength<destCapacity) {
290
                            dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
291
                        }
292
                        q=(q-t)/(BASE-t);
293
                    }
294

295
                    if(destLength<destCapacity) {
296
                        dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
297
                    }
298
                    bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
299
                    delta=0;
300
                    ++handledCPCount;
301
                }
302
            }
303

304
            ++delta;
305
            ++n;
306
        }
307

308
        return result.append(dest, 0, destLength);
309
    }
310

311
    private static boolean isBasic(int ch){
312
        return (ch < INITIAL_N);
313
    }
314

315
    private static boolean isBasicUpperCase(int ch){
316
        return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
317
    }
318

319
    private static boolean isSurrogate(int ch){
320
        return (((ch)&0xfffff800)==0xd800);
321
    }
322
    /**
323
     * Converts Punycode to Unicode.
324
     * The Unicode string will be at most as long as the Punycode string.
325
     *
326
     * @param src
327
     * @param caseFlags
328
     * @return
329
     * @throws ParseException
330
     */
331
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
332
                               throws ParseException{
333
        int srcLength = src.length();
334
        StringBuffer result = new StringBuffer();
335
        int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
336
                destCPCount, firstSupplementaryIndex, cpLength;
337
        char b;
338
        int destCapacity = MAX_CP_COUNT;
339
        char[] dest = new char[destCapacity];
340

341
        /*
342
         * Handle the basic code points:
343
         * Let basicLength be the number of input code points
344
         * before the last delimiter, or 0 if there is none,
345
         * then copy the first basicLength code points to the output.
346
         *
347
         * The two following loops iterate backward.
348
         */
349
        for(j=srcLength; j>0;) {
350
            if(src.charAt(--j)==DELIMITER) {
351
                break;
352
            }
353
        }
354
        destLength=basicLength=destCPCount=j;
355

356
        while(j>0) {
357
            b=src.charAt(--j);
358
            if(!isBasic(b)) {
359
                throw new ParseException("Illegal char found", -1);
360
            }
361

362
            if(j<destCapacity) {
363
                dest[j]= b;
364

365
                if(caseFlags!=null) {
366
                    caseFlags[j]=isBasicUpperCase(b);
367
                }
368
            }
369
        }
370

371
        /* Initialize the state: */
372
        n=INITIAL_N;
373
        i=0;
374
        bias=INITIAL_BIAS;
375
        firstSupplementaryIndex=1000000000;
376

377
        /*
378
         * Main decoding loop:
379
         * Start just after the last delimiter if any
380
         * basic code points were copied; start at the beginning otherwise.
381
         */
382
        for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
383
            /*
384
             * in is the index of the next character to be consumed, and
385
             * destCPCount is the number of code points in the output array.
386
             *
387
             * Decode a generalized variable-length integer into delta,
388
             * which gets added to i.  The overflow checking is easier
389
             * if we increase i as we go, then subtract off its starting
390
             * value at the end to obtain delta.
391
             */
392
            for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
393
                if(in>=srcLength) {
394
                    throw new ParseException("Illegal char found", -1);
395
                }
396

397
                digit=basicToDigit[(byte)src.charAt(in++)];
398
                if(digit<0) {
399
                    throw new ParseException("Invalid char found", -1);
400
                }
401
                if(digit>(0x7fffffff-i)/w) {
402
                    /* integer overflow */
403
                    throw new ParseException("Illegal char found", -1);
404
                }
405

406
                i+=digit*w;
407
                t=k-bias;
408
                if(t<TMIN) {
409
                    t=TMIN;
410
                } else if(k>=(bias+TMAX)) {
411
                    t=TMAX;
412
                }
413
                if(digit<t) {
414
                    break;
415
                }
416

417
                if(w>0x7fffffff/(BASE-t)) {
418
                    /* integer overflow */
419
                    throw new ParseException("Illegal char found", -1);
420
                }
421
                w*=BASE-t;
422
            }
423

424
            /*
425
             * Modification from sample code:
426
             * Increments destCPCount here,
427
             * where needed instead of in for() loop tail.
428
             */
429
            ++destCPCount;
430
            bias=adaptBias(i-oldi, destCPCount, (oldi==0));
431

432
            /*
433
             * i was supposed to wrap around from (incremented) destCPCount to 0,
434
             * incrementing n each time, so we'll fix that now:
435
             */
436
            if(i/destCPCount>(0x7fffffff-n)) {
437
                /* integer overflow */
438
                throw new ParseException("Illegal char found", -1);
439
            }
440

441
            n+=i/destCPCount;
442
            i%=destCPCount;
443
            /* not needed for Punycode: */
444
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
445

446
            if(n>0x10ffff || isSurrogate(n)) {
447
                /* Unicode code point overflow */
448
                throw new ParseException("Illegal char found", -1);
449
            }
450

451
            /* Insert n at position i of the output: */
452
            cpLength=UTF16.getCharCount(n);
453
            if((destLength+cpLength)<destCapacity) {
454
                int codeUnitIndex;
455

456
                /*
457
                 * Handle indexes when supplementary code points are present.
458
                 *
459
                 * In almost all cases, there will be only BMP code points before i
460
                 * and even in the entire string.
461
                 * This is handled with the same efficiency as with UTF-32.
462
                 *
463
                 * Only the rare cases with supplementary code points are handled
464
                 * more slowly - but not too bad since this is an insertion anyway.
465
                 */
466
                if(i<=firstSupplementaryIndex) {
467
                    codeUnitIndex=i;
468
                    if(cpLength>1) {
469
                        firstSupplementaryIndex=codeUnitIndex;
470
                    } else {
471
                        ++firstSupplementaryIndex;
472
                    }
473
                } else {
474
                    codeUnitIndex=firstSupplementaryIndex;
475
                    codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
476
                }
477

478
                /* use the UChar index codeUnitIndex instead of the code point index i */
479
                if(codeUnitIndex<destLength) {
480
                    System.arraycopy(dest, codeUnitIndex,
481
                                     dest, codeUnitIndex+cpLength,
482
                                    (destLength-codeUnitIndex));
483
                    if(caseFlags!=null) {
484
                        System.arraycopy(caseFlags, codeUnitIndex,
485
                                         caseFlags, codeUnitIndex+cpLength,
486
                                         destLength-codeUnitIndex);
487
                    }
488
                }
489
                if(cpLength==1) {
490
                    /* BMP, insert one code unit */
491
                    dest[codeUnitIndex]=(char)n;
492
                } else {
493
                    /* supplementary character, insert two code units */
494
                    dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
495
                    dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
496
                }
497
                if(caseFlags!=null) {
498
                    /* Case of last character determines uppercase flag: */
499
                    caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
500
                    if(cpLength==2) {
501
                        caseFlags[codeUnitIndex+1]=false;
502
                    }
503
                }
504
            }
505
            destLength+=cpLength;
506
            ++i;
507
        }
508
        result.append(dest, 0, destLength);
509
        return result;
510
    }
511
}
512

513
Product

Resources

Company