CoCalc -- StringPrep.java

GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/net/idn/StringPrep.java
⁴⁷⁰⁸⁶ views
1
/*
2
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3
 *
4
 * This code is free software; you can redistribute it and/or modify it
5
 * under the terms of the GNU General Public License version 2 only, as
6
 * published by the Free Software Foundation.  Oracle designates this
7
 * particular file as subject to the "Classpath" exception as provided
8
 * by Oracle in the LICENSE file that accompanied this code.
9
 *
10
 * This code is distributed in the hope that it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
 * version 2 for more details (a copy is included in the LICENSE file that
14
 * accompanied this code).
15
 *
16
 * You should have received a copy of the GNU General Public License version
17
 * 2 along with this work; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
 *
20
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
 * or visit www.oracle.com if you need additional information or have any
22
 * questions.
23
 */
24
/*
25
/*
26
 *******************************************************************************
27
 * Copyright (C) 2003-2004, International Business Machines Corporation and         *
28
 * others. All Rights Reserved.                                                *
29
 *******************************************************************************
30
 */
31
//
32
// CHANGELOG
33
//      2005-05-19 Edward Wang
34
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
35
//          - move from package com.ibm.icu.text to package sun.net.idn
36
//          - use ParseException instead of StringPrepParseException
37
//          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
38
//          - remove all @deprecated tag to make compiler happy
39
//      2007-08-14 Martin Buchholz
40
//          - remove redundant casts
41
//
42
package sun.net.idn;
43

44
import java.io.BufferedInputStream;
45
import java.io.ByteArrayInputStream;
46
import java.io.IOException;
47
import java.io.InputStream;
48
import java.text.ParseException;
49

50
import sun.text.Normalizer;
51
import sun.text.normalizer.CharTrie;
52
import sun.text.normalizer.Trie;
53
import sun.text.normalizer.NormalizerImpl;
54
import sun.text.normalizer.VersionInfo;
55
import sun.text.normalizer.UCharacter;
56
import sun.text.normalizer.UCharacterIterator;
57
import sun.text.normalizer.UTF16;
58
import sun.net.idn.UCharacterDirection;
59
import sun.net.idn.StringPrepDataReader;
60

61
/**
62
 * StringPrep API implements the StringPrep framework as described by
63
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
64
 * StringPrep prepares Unicode strings for use in network protocols.
65
 * Profiles of StringPrep are sets of rules and data according to which the
66
 * Unicode Strings are prepared. Each profile contains tables which describe
67
 * how a code point should be treated. The tables are broadly classified into
68
 * <ul>
69
 *     <li> Unassigned Table: Contains code points that are unassigned
70
 *          in the Unicode Version supported by StringPrep. Currently
71
 *          RFC 3454 supports Unicode 3.2. </li>
72
 *     <li> Prohibited Table: Contains code points that are prohibited from
73
 *          the output of the StringPrep processing function. </li>
74
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
75
 * </ul>
76
 *
77
 * The procedure for preparing Unicode strings:
78
 * <ol>
79
 *      <li> Map: For each character in the input, check if it has a mapping
80
 *           and, if so, replace it with its mapping. </li>
81
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
82
 *           normalization. </li>
83
 *      <li> Prohibit: Check for any characters that are not allowed in the
84
 *           output.  If any are found, return an error.</li>
85
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
86
 *           any are found, make sure that the whole string satisfies the
87
 *           requirements for bidirectional strings.  If the string does not
88
 *           satisfy the requirements for bidirectional strings, return an
89
 *           error.  </li>
90
 * </ol>
91
 * @author Ram Viswanadha
92
 * @draft ICU 2.8
93
 */
94
public final class StringPrep {
95
    /**
96
     * Option to prohibit processing of unassigned code points in the input
97
     *
98
     * @see   #prepare
99
     * @draft ICU 2.8
100
     */
101
    public static final int DEFAULT = 0x0000;
102

103
    /**
104
     * Option to allow processing of unassigned code points in the input
105
     *
106
     * @see   #prepare
107
     * @draft ICU 2.8
108
     */
109
    public static final int ALLOW_UNASSIGNED = 0x0001;
110

111
    private static final int UNASSIGNED        = 0x0000;
112
    private static final int MAP               = 0x0001;
113
    private static final int PROHIBITED        = 0x0002;
114
    private static final int DELETE            = 0x0003;
115
    private static final int TYPE_LIMIT        = 0x0004;
116

117
    private static final int NORMALIZATION_ON  = 0x0001;
118
    private static final int CHECK_BIDI_ON     = 0x0002;
119

120
    private static final int TYPE_THRESHOLD       = 0xFFF0;
121
    private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
122
    private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
123

124
    /* indexes[] value names */
125
    private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
126
    private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
127
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
128
    private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
129
    private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
130
    private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
131
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
132
    private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
133
    private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
134

135

136
    /**
137
     * Default buffer size of datafile
138
     */
139
    private static final int DATA_BUFFER_SIZE = 25000;
140

141
    /* Wrappers for Trie implementations */
142
    private static final class StringPrepTrieImpl implements Trie.DataManipulate{
143
        private CharTrie sprepTrie = null;
144
       /**
145
        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
146
        * data the index array offset of the indexes for that lead surrogate.
147
        * @param property data value for a surrogate from the trie, including
148
        *        the folding offset
149
        * @return data offset or 0 if there is no data for the lead surrogate
150
        */
151
         public int getFoldingOffset(int value){
152
            return value;
153
        }
154
    }
155

156
    // CharTrie implementation for reading the trie data
157
    private StringPrepTrieImpl sprepTrieImpl;
158
    // Indexes read from the data file
159
    private int[] indexes;
160
    // mapping data read from the data file
161
    private char[] mappingData;
162
    // format version of the data file
163
    private byte[] formatVersion;
164
    // the version of Unicode supported by the data file
165
    private VersionInfo sprepUniVer;
166
    // the Unicode version of last entry in the
167
    // NormalizationCorrections.txt file if normalization
168
    // is turned on
169
    private VersionInfo normCorrVer;
170
    // Option to turn on Normalization
171
    private boolean doNFKC;
172
    // Option to turn on checking for BiDi rules
173
    private boolean checkBiDi;
174

175

176
    private char getCodePointValue(int ch){
177
        return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
178
    }
179

180
    private static VersionInfo getVersionInfo(int comp){
181
        int micro = comp & 0xFF;
182
        int milli =(comp >> 8)  & 0xFF;
183
        int minor =(comp >> 16) & 0xFF;
184
        int major =(comp >> 24) & 0xFF;
185
        return VersionInfo.getInstance(major,minor,milli,micro);
186
    }
187
    private static VersionInfo getVersionInfo(byte[] version){
188
        if(version.length != 4){
189
            return null;
190
        }
191
        return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
192
    }
193
    /**
194
     * Creates an StringPrep object after reading the input stream.
195
     * The object does not hold a reference to the input steam, so the stream can be
196
     * closed after the method returns.
197
     *
198
     * @param inputStream The stream for reading the StringPrep profile binarySun
199
     * @throws IOException
200
     * @draft ICU 2.8
201
     */
202
    public StringPrep(InputStream inputStream) throws IOException{
203

204
        BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
205

206
        StringPrepDataReader reader = new StringPrepDataReader(b);
207

208
        // read the indexes
209
        indexes = reader.readIndexes(INDEX_TOP);
210

211
        byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
212

213

214
        //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
215
        mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
216
        // load the rest of the data data and initialize the data members
217
        reader.read(sprepBytes,mappingData);
218

219
        sprepTrieImpl           = new StringPrepTrieImpl();
220
        sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
221

222
        // get the data format version
223
        formatVersion = reader.getDataFormatVersion();
224

225
        // get the options
226
        doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
227
        checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
228
        sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
229
        normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
230
        VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
231
        if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
232
           normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
233
           ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
234
           ){
235
            throw new IOException("Normalization Correction version not supported");
236
        }
237
        b.close();
238
    }
239

240
    private static final class Values{
241
        boolean isIndex;
242
        int value;
243
        int type;
244
        public void reset(){
245
            isIndex = false;
246
            value = 0;
247
            type = -1;
248
        }
249
    }
250

251
    private static final void getValues(char trieWord,Values values){
252
        values.reset();
253
        if(trieWord == 0){
254
            /*
255
             * Initial value stored in the mapping table
256
             * just return TYPE_LIMIT .. so that
257
             * the source codepoint is copied to the destination
258
             */
259
            values.type = TYPE_LIMIT;
260
        }else if(trieWord >= TYPE_THRESHOLD){
261
            values.type = (trieWord - TYPE_THRESHOLD);
262
        }else{
263
            /* get the type */
264
            values.type = MAP;
265
            /* ascertain if the value is index or delta */
266
            if((trieWord & 0x02)>0){
267
                values.isIndex = true;
268
                values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
269

270
            }else{
271
                values.isIndex = false;
272
                values.value = (trieWord<<16)>>16;
273
                values.value =  (values.value >> 2);
274

275
            }
276

277
            if((trieWord>>2) == MAX_INDEX_VALUE){
278
                values.type = DELETE;
279
                values.isIndex = false;
280
                values.value = 0;
281
            }
282
        }
283
    }
284

285

286

287
    private StringBuffer map( UCharacterIterator iter, int options)
288
                            throws ParseException {
289

290
        Values val = new Values();
291
        char result = 0;
292
        int ch  = UCharacterIterator.DONE;
293
        StringBuffer dest = new StringBuffer();
294
        boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
295

296
        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
297

298
            result = getCodePointValue(ch);
299
            getValues(result,val);
300

301
            // check if the source codepoint is unassigned
302
            if(val.type == UNASSIGNED && allowUnassigned == false){
303
                 throw new ParseException("An unassigned code point was found in the input " +
304
                                          iter.getText(), iter.getIndex());
305
            }else if((val.type == MAP)){
306
                int index, length;
307

308
                if(val.isIndex){
309
                    index = val.value;
310
                    if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
311
                             index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
312
                        length = 1;
313
                    }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
314
                             index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
315
                        length = 2;
316
                    }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
317
                             index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
318
                        length = 3;
319
                    }else{
320
                        length = mappingData[index++];
321
                    }
322
                    /* copy mapping to destination */
323
                    dest.append(mappingData,index,length);
324
                    continue;
325

326
                }else{
327
                    ch -= val.value;
328
                }
329
            }else if(val.type == DELETE){
330
                // just consume the codepoint and contine
331
                continue;
332
            }
333
            //copy the source into destination
334
            UTF16.append(dest,ch);
335
        }
336

337
        return dest;
338
    }
339

340

341
    private StringBuffer normalize(StringBuffer src){
342
        /*
343
         * Option UNORM_BEFORE_PRI_29:
344
         *
345
         * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
346
         * requires strict adherence to Unicode 3.2 normalization,
347
         * including buggy composition from before fixing Public Review Issue #29.
348
         * Note that this results in some valid but nonsensical text to be
349
         * either corrupted or rejected, depending on the text.
350
         * See http://www.unicode.org/review/resolved-pri.html#pri29
351
         * See unorm.cpp and cnormtst.c
352
         */
353
        return new StringBuffer(
354
            Normalizer.normalize(
355
                src.toString(),
356
                java.text.Normalizer.Form.NFKC,
357
                Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
358
    }
359
    /*
360
    boolean isLabelSeparator(int ch){
361
        int result = getCodePointValue(ch);
362
        if( (result & 0x07)  == LABEL_SEPARATOR){
363
            return true;
364
        }
365
        return false;
366
    }
367
    */
368
     /*
369
       1) Map -- For each character in the input, check if it has a mapping
370
          and, if so, replace it with its mapping.
371

372
       2) Normalize -- Possibly normalize the result of step 1 using Unicode
373
          normalization.
374

375
       3) Prohibit -- Check for any characters that are not allowed in the
376
          output.  If any are found, return an error.
377

378
       4) Check bidi -- Possibly check for right-to-left characters, and if
379
          any are found, make sure that the whole string satisfies the
380
          requirements for bidirectional strings.  If the string does not
381
          satisfy the requirements for bidirectional strings, return an
382
          error.
383
          [Unicode3.2] defines several bidirectional categories; each character
384
           has one bidirectional category assigned to it.  For the purposes of
385
           the requirements below, an "RandALCat character" is a character that
386
           has Unicode bidirectional categories "R" or "AL"; an "LCat character"
387
           is a character that has Unicode bidirectional category "L".  Note
388

389

390
           that there are many characters which fall in neither of the above
391
           definitions; Latin digits (<U+0030> through <U+0039>) are examples of
392
           this because they have bidirectional category "EN".
393

394
           In any profile that specifies bidirectional character handling, all
395
           three of the following requirements MUST be met:
396

397
           1) The characters in section 5.8 MUST be prohibited.
398

399
           2) If a string contains any RandALCat character, the string MUST NOT
400
              contain any LCat character.
401

402
           3) If a string contains any RandALCat character, a RandALCat
403
              character MUST be the first character of the string, and a
404
              RandALCat character MUST be the last character of the string.
405
    */
406
    /**
407
     * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
408
     * checks for prohited and BiDi characters in the order defined by RFC 3454
409
     * depending on the options specified in the profile.
410
     *
411
     * @param src           A UCharacterIterator object containing the source string
412
     * @param options       A bit set of options:
413
     *
414
     *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
415
     *
416
     *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
417
     *                                  as normal Unicode code points.
418
     *
419
     * @return StringBuffer A StringBuffer containing the output
420
     * @throws ParseException
421
     * @draft ICU 2.8
422
     */
423
    public StringBuffer prepare(UCharacterIterator src, int options)
424
                        throws ParseException{
425

426
        // map
427
        StringBuffer mapOut = map(src,options);
428
        StringBuffer normOut = mapOut;// initialize
429

430
        if(doNFKC){
431
            // normalize
432
            normOut = normalize(mapOut);
433
        }
434

435
        int ch;
436
        char result;
437
        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
438
        Values val = new Values();
439
        int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
440
            firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
441
        int rtlPos=-1, ltrPos=-1;
442
        boolean rightToLeft=false, leftToRight=false;
443

444
        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
445
            result = getCodePointValue(ch);
446
            getValues(result,val);
447

448
            if(val.type == PROHIBITED ){
449
                throw new ParseException("A prohibited code point was found in the input" +
450
                                         iter.getText(), val.value);
451
            }
452

453
            direction = UCharacter.getDirection(ch);
454
            if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
455
                firstCharDir = direction;
456
            }
457
            if(direction == UCharacterDirection.LEFT_TO_RIGHT){
458
                leftToRight = true;
459
                ltrPos = iter.getIndex()-1;
460
            }
461
            if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
462
                rightToLeft = true;
463
                rtlPos = iter.getIndex()-1;
464
            }
465
        }
466
        if(checkBiDi == true){
467
            // satisfy 2
468
            if( leftToRight == true && rightToLeft == true){
469
                throw new ParseException("The input does not conform to the rules for BiDi code points." +
470
                                         iter.getText(),
471
                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
472
             }
473

474
            //satisfy 3
475
            if( rightToLeft == true &&
476
                !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
477
                (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
478
              ){
479
                throw new ParseException("The input does not conform to the rules for BiDi code points." +
480
                                         iter.getText(),
481
                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
482
            }
483
        }
484
        return normOut;
485

486
      }
487
}
488

489
Product

Resources

Company