Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/net/idn/StringPrep.java
38918 views
1
/*
2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3
*
4
* This code is free software; you can redistribute it and/or modify it
5
* under the terms of the GNU General Public License version 2 only, as
6
* published by the Free Software Foundation. Oracle designates this
7
* particular file as subject to the "Classpath" exception as provided
8
* by Oracle in the LICENSE file that accompanied this code.
9
*
10
* This code is distributed in the hope that it will be useful, but WITHOUT
11
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
* version 2 for more details (a copy is included in the LICENSE file that
14
* accompanied this code).
15
*
16
* You should have received a copy of the GNU General Public License version
17
* 2 along with this work; if not, write to the Free Software Foundation,
18
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
*
20
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
* or visit www.oracle.com if you need additional information or have any
22
* questions.
23
*/
24
/*
25
/*
26
*******************************************************************************
27
* Copyright (C) 2003-2004, International Business Machines Corporation and *
28
* others. All Rights Reserved. *
29
*******************************************************************************
30
*/
31
//
32
// CHANGELOG
33
// 2005-05-19 Edward Wang
34
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
35
// - move from package com.ibm.icu.text to package sun.net.idn
36
// - use ParseException instead of StringPrepParseException
37
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
38
// - remove all @deprecated tag to make compiler happy
39
// 2007-08-14 Martin Buchholz
40
// - remove redundant casts
41
//
42
package sun.net.idn;
43
44
import java.io.BufferedInputStream;
45
import java.io.ByteArrayInputStream;
46
import java.io.IOException;
47
import java.io.InputStream;
48
import java.text.ParseException;
49
50
import sun.text.Normalizer;
51
import sun.text.normalizer.CharTrie;
52
import sun.text.normalizer.Trie;
53
import sun.text.normalizer.NormalizerImpl;
54
import sun.text.normalizer.VersionInfo;
55
import sun.text.normalizer.UCharacter;
56
import sun.text.normalizer.UCharacterIterator;
57
import sun.text.normalizer.UTF16;
58
import sun.net.idn.UCharacterDirection;
59
import sun.net.idn.StringPrepDataReader;
60
61
/**
62
* StringPrep API implements the StringPrep framework as described by
63
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
64
* StringPrep prepares Unicode strings for use in network protocols.
65
* Profiles of StringPrep are sets of rules and data according to which the
66
* Unicode Strings are prepared. Each profile contains tables which describe
67
* how a code point should be treated. The tables are broadly classified into
68
* <ul>
69
* <li> Unassigned Table: Contains code points that are unassigned
70
* in the Unicode Version supported by StringPrep. Currently
71
* RFC 3454 supports Unicode 3.2. </li>
72
* <li> Prohibited Table: Contains code points that are prohibited from
73
* the output of the StringPrep processing function. </li>
74
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
75
* </ul>
76
*
77
* The procedure for preparing Unicode strings:
78
* <ol>
79
* <li> Map: For each character in the input, check if it has a mapping
80
* and, if so, replace it with its mapping. </li>
81
* <li> Normalize: Possibly normalize the result of step 1 using Unicode
82
* normalization. </li>
83
* <li> Prohibit: Check for any characters that are not allowed in the
84
* output. If any are found, return an error.</li>
85
* <li> Check bidi: Possibly check for right-to-left characters, and if
86
* any are found, make sure that the whole string satisfies the
87
* requirements for bidirectional strings. If the string does not
88
* satisfy the requirements for bidirectional strings, return an
89
* error. </li>
90
* </ol>
91
* @author Ram Viswanadha
92
* @draft ICU 2.8
93
*/
94
public final class StringPrep {
95
/**
96
* Option to prohibit processing of unassigned code points in the input
97
*
98
* @see #prepare
99
* @draft ICU 2.8
100
*/
101
public static final int DEFAULT = 0x0000;
102
103
/**
104
* Option to allow processing of unassigned code points in the input
105
*
106
* @see #prepare
107
* @draft ICU 2.8
108
*/
109
public static final int ALLOW_UNASSIGNED = 0x0001;
110
111
private static final int UNASSIGNED = 0x0000;
112
private static final int MAP = 0x0001;
113
private static final int PROHIBITED = 0x0002;
114
private static final int DELETE = 0x0003;
115
private static final int TYPE_LIMIT = 0x0004;
116
117
private static final int NORMALIZATION_ON = 0x0001;
118
private static final int CHECK_BIDI_ON = 0x0002;
119
120
private static final int TYPE_THRESHOLD = 0xFFF0;
121
private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
122
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
123
124
/* indexes[] value names */
125
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
126
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
127
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
128
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
129
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
130
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
131
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
132
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
133
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
134
135
136
/**
137
* Default buffer size of datafile
138
*/
139
private static final int DATA_BUFFER_SIZE = 25000;
140
141
/* Wrappers for Trie implementations */
142
private static final class StringPrepTrieImpl implements Trie.DataManipulate{
143
private CharTrie sprepTrie = null;
144
/**
145
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
146
* data the index array offset of the indexes for that lead surrogate.
147
* @param property data value for a surrogate from the trie, including
148
* the folding offset
149
* @return data offset or 0 if there is no data for the lead surrogate
150
*/
151
public int getFoldingOffset(int value){
152
return value;
153
}
154
}
155
156
// CharTrie implementation for reading the trie data
157
private StringPrepTrieImpl sprepTrieImpl;
158
// Indexes read from the data file
159
private int[] indexes;
160
// mapping data read from the data file
161
private char[] mappingData;
162
// format version of the data file
163
private byte[] formatVersion;
164
// the version of Unicode supported by the data file
165
private VersionInfo sprepUniVer;
166
// the Unicode version of last entry in the
167
// NormalizationCorrections.txt file if normalization
168
// is turned on
169
private VersionInfo normCorrVer;
170
// Option to turn on Normalization
171
private boolean doNFKC;
172
// Option to turn on checking for BiDi rules
173
private boolean checkBiDi;
174
175
176
private char getCodePointValue(int ch){
177
return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
178
}
179
180
private static VersionInfo getVersionInfo(int comp){
181
int micro = comp & 0xFF;
182
int milli =(comp >> 8) & 0xFF;
183
int minor =(comp >> 16) & 0xFF;
184
int major =(comp >> 24) & 0xFF;
185
return VersionInfo.getInstance(major,minor,milli,micro);
186
}
187
private static VersionInfo getVersionInfo(byte[] version){
188
if(version.length != 4){
189
return null;
190
}
191
return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
192
}
193
/**
194
* Creates an StringPrep object after reading the input stream.
195
* The object does not hold a reference to the input steam, so the stream can be
196
* closed after the method returns.
197
*
198
* @param inputStream The stream for reading the StringPrep profile binarySun
199
* @throws IOException
200
* @draft ICU 2.8
201
*/
202
public StringPrep(InputStream inputStream) throws IOException{
203
204
BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
205
206
StringPrepDataReader reader = new StringPrepDataReader(b);
207
208
// read the indexes
209
indexes = reader.readIndexes(INDEX_TOP);
210
211
byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
212
213
214
//indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
215
mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
216
// load the rest of the data data and initialize the data members
217
reader.read(sprepBytes,mappingData);
218
219
sprepTrieImpl = new StringPrepTrieImpl();
220
sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );
221
222
// get the data format version
223
formatVersion = reader.getDataFormatVersion();
224
225
// get the options
226
doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
227
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
228
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
229
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
230
VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
231
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
232
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
233
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
234
){
235
throw new IOException("Normalization Correction version not supported");
236
}
237
b.close();
238
}
239
240
private static final class Values{
241
boolean isIndex;
242
int value;
243
int type;
244
public void reset(){
245
isIndex = false;
246
value = 0;
247
type = -1;
248
}
249
}
250
251
private static final void getValues(char trieWord,Values values){
252
values.reset();
253
if(trieWord == 0){
254
/*
255
* Initial value stored in the mapping table
256
* just return TYPE_LIMIT .. so that
257
* the source codepoint is copied to the destination
258
*/
259
values.type = TYPE_LIMIT;
260
}else if(trieWord >= TYPE_THRESHOLD){
261
values.type = (trieWord - TYPE_THRESHOLD);
262
}else{
263
/* get the type */
264
values.type = MAP;
265
/* ascertain if the value is index or delta */
266
if((trieWord & 0x02)>0){
267
values.isIndex = true;
268
values.value = trieWord >> 2; //mask off the lower 2 bits and shift
269
270
}else{
271
values.isIndex = false;
272
values.value = (trieWord<<16)>>16;
273
values.value = (values.value >> 2);
274
275
}
276
277
if((trieWord>>2) == MAX_INDEX_VALUE){
278
values.type = DELETE;
279
values.isIndex = false;
280
values.value = 0;
281
}
282
}
283
}
284
285
286
287
private StringBuffer map( UCharacterIterator iter, int options)
288
throws ParseException {
289
290
Values val = new Values();
291
char result = 0;
292
int ch = UCharacterIterator.DONE;
293
StringBuffer dest = new StringBuffer();
294
boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
295
296
while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
297
298
result = getCodePointValue(ch);
299
getValues(result,val);
300
301
// check if the source codepoint is unassigned
302
if(val.type == UNASSIGNED && allowUnassigned == false){
303
throw new ParseException("An unassigned code point was found in the input " +
304
iter.getText(), iter.getIndex());
305
}else if((val.type == MAP)){
306
int index, length;
307
308
if(val.isIndex){
309
index = val.value;
310
if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
311
index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
312
length = 1;
313
}else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
314
index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
315
length = 2;
316
}else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
317
index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
318
length = 3;
319
}else{
320
length = mappingData[index++];
321
}
322
/* copy mapping to destination */
323
dest.append(mappingData,index,length);
324
continue;
325
326
}else{
327
ch -= val.value;
328
}
329
}else if(val.type == DELETE){
330
// just consume the codepoint and contine
331
continue;
332
}
333
//copy the source into destination
334
UTF16.append(dest,ch);
335
}
336
337
return dest;
338
}
339
340
341
private StringBuffer normalize(StringBuffer src){
342
/*
343
* Option UNORM_BEFORE_PRI_29:
344
*
345
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
346
* requires strict adherence to Unicode 3.2 normalization,
347
* including buggy composition from before fixing Public Review Issue #29.
348
* Note that this results in some valid but nonsensical text to be
349
* either corrupted or rejected, depending on the text.
350
* See http://www.unicode.org/review/resolved-pri.html#pri29
351
* See unorm.cpp and cnormtst.c
352
*/
353
return new StringBuffer(
354
Normalizer.normalize(
355
src.toString(),
356
java.text.Normalizer.Form.NFKC,
357
Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
358
}
359
/*
360
boolean isLabelSeparator(int ch){
361
int result = getCodePointValue(ch);
362
if( (result & 0x07) == LABEL_SEPARATOR){
363
return true;
364
}
365
return false;
366
}
367
*/
368
/*
369
1) Map -- For each character in the input, check if it has a mapping
370
and, if so, replace it with its mapping.
371
372
2) Normalize -- Possibly normalize the result of step 1 using Unicode
373
normalization.
374
375
3) Prohibit -- Check for any characters that are not allowed in the
376
output. If any are found, return an error.
377
378
4) Check bidi -- Possibly check for right-to-left characters, and if
379
any are found, make sure that the whole string satisfies the
380
requirements for bidirectional strings. If the string does not
381
satisfy the requirements for bidirectional strings, return an
382
error.
383
[Unicode3.2] defines several bidirectional categories; each character
384
has one bidirectional category assigned to it. For the purposes of
385
the requirements below, an "RandALCat character" is a character that
386
has Unicode bidirectional categories "R" or "AL"; an "LCat character"
387
is a character that has Unicode bidirectional category "L". Note
388
389
390
that there are many characters which fall in neither of the above
391
definitions; Latin digits (<U+0030> through <U+0039>) are examples of
392
this because they have bidirectional category "EN".
393
394
In any profile that specifies bidirectional character handling, all
395
three of the following requirements MUST be met:
396
397
1) The characters in section 5.8 MUST be prohibited.
398
399
2) If a string contains any RandALCat character, the string MUST NOT
400
contain any LCat character.
401
402
3) If a string contains any RandALCat character, a RandALCat
403
character MUST be the first character of the string, and a
404
RandALCat character MUST be the last character of the string.
405
*/
406
/**
407
* Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
408
* checks for prohited and BiDi characters in the order defined by RFC 3454
409
* depending on the options specified in the profile.
410
*
411
* @param src A UCharacterIterator object containing the source string
412
* @param options A bit set of options:
413
*
414
* - StringPrep.NONE Prohibit processing of unassigned code points in the input
415
*
416
* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input
417
* as normal Unicode code points.
418
*
419
* @return StringBuffer A StringBuffer containing the output
420
* @throws ParseException
421
* @draft ICU 2.8
422
*/
423
public StringBuffer prepare(UCharacterIterator src, int options)
424
throws ParseException{
425
426
// map
427
StringBuffer mapOut = map(src,options);
428
StringBuffer normOut = mapOut;// initialize
429
430
if(doNFKC){
431
// normalize
432
normOut = normalize(mapOut);
433
}
434
435
int ch;
436
char result;
437
UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
438
Values val = new Values();
439
int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
440
firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
441
int rtlPos=-1, ltrPos=-1;
442
boolean rightToLeft=false, leftToRight=false;
443
444
while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
445
result = getCodePointValue(ch);
446
getValues(result,val);
447
448
if(val.type == PROHIBITED ){
449
throw new ParseException("A prohibited code point was found in the input" +
450
iter.getText(), val.value);
451
}
452
453
direction = UCharacter.getDirection(ch);
454
if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
455
firstCharDir = direction;
456
}
457
if(direction == UCharacterDirection.LEFT_TO_RIGHT){
458
leftToRight = true;
459
ltrPos = iter.getIndex()-1;
460
}
461
if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
462
rightToLeft = true;
463
rtlPos = iter.getIndex()-1;
464
}
465
}
466
if(checkBiDi == true){
467
// satisfy 2
468
if( leftToRight == true && rightToLeft == true){
469
throw new ParseException("The input does not conform to the rules for BiDi code points." +
470
iter.getText(),
471
(rtlPos>ltrPos) ? rtlPos : ltrPos);
472
}
473
474
//satisfy 3
475
if( rightToLeft == true &&
476
!((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
477
(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
478
){
479
throw new ParseException("The input does not conform to the rules for BiDi code points." +
480
iter.getText(),
481
(rtlPos>ltrPos) ? rtlPos : ltrPos);
482
}
483
}
484
return normOut;
485
486
}
487
}
488
489