Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java
32287 views
1
/*
2
* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
package build.tools.generatecharacter;
27
28
import java.io.IOException;
29
import java.io.FileNotFoundException;
30
import java.io.BufferedReader;
31
import java.io.FileReader;
32
import java.io.PrintWriter;
33
import java.io.BufferedWriter;
34
import java.io.FileWriter;
35
import java.io.File;
36
import java.util.List;
37
38
import build.tools.generatecharacter.CharacterName;
39
40
/**
41
* This program generates the source code for the class java.lang.Character.
42
* It also generates native C code that can perform the same operations.
43
* It requires two external input data files:
44
* <ul>
45
* <li> Unicode specification file
46
* <li> Character class template file
47
* </ul>
48
* The Unicode specification file is available from the Unicode consortium.
49
* It has character specification lines that look like this:
50
* <listing>
51
* 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
52
* </listing>
53
* The Character class template file is filled in with additional
54
* information to produce the file Character.java, which can then be
55
* compiled by a Java compiler. The template file contains certain
56
* markers consisting of an alphabetic name string preceded by "$$".
57
* Such markers are replaced with generated program text. As a special
58
* case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
59
* alphabetic characters constituting a variable name. The character "_"
60
* is considered alphabetic for these purposes.
61
*
62
* @author Guy Steele
63
* @author Alan Liu
64
* @author John O'Conner
65
*/
66
67
public class GenerateCharacter {
68
69
// When true, buildOne reports problematic case mappings through FAIL.
final static boolean DEBUG = false;
70
71
// Marker that introduces a macro call in a template file (see generateCharacterClass).
final static String commandMarker = "$$";
72
// Prefix for the default file names below. The defaults are computed once, when this
// class is initialized, so assigning ROOT afterwards does not change them.
static String ROOT = "";
73
static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
74
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75
static String DefaultPropListFileName = ROOT + "PropList.txt";
76
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
77
static String DefaultJavaOutputFileName = ROOT + "Character.java";
78
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
79
static String DefaultCOutputFileName = ROOT + "Character.c";
80
81
// Unicode plane being processed; buildMap starts at code point (plane << 16).
static int plane = 0;
82
83
/* The overall idea is that, in the generated Character class source code,
84
most character property data is stored in a special multi-level table whose
85
structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
86
The integers must sum to 16 (the number of bits in a character).
87
The first table is indexed by the k1 high-order bits of the character code.
88
The result is concatenated to the next k2 bits of the character code to index
89
the second table, and so on. Eventually the kn low-order bits of the character
90
code are concatenated and used to index one of two tables A and B; A contains
91
32-bit integer entries and B contains 16-bit short entries. The 48 bits that
92
can be thus obtained encode the properties for the character.
93
94
The default specification is [9, 4, 3, 0]. This particular table format was
95
designed by conducting an exhaustive search of table formats to minimize the
96
space consumed by the tables: the first and third tables need have only byte
97
values (the second table must have short values). Another good choice is
98
[10, 6, 0], which produces a larger table but allows particularly fast table
99
lookup code.
100
101
In each case, where the word "concatenated" is used, this may imply
102
first a << and then a | operation, or perhaps just a | operation if
103
the values in the table can be preshifted (generally possible if the table
104
entries are short rather than byte).
105
*/
106
107
/* The character properties are currently encoded in two parts,
108
A (32 bits) and B (16 bits).
109
110
A: the low 32 bits are defined in the following manner:
111
112
1 bit Mirrored property.
113
4 bits Bidirectional category (see below) (unused if -nobidi switch specified)
114
9 bits A signed offset used for converting case.
115
1 bit If 1, adding the signed offset converts the character to lowercase.
116
1 bit If 1, subtracting the signed offset converts the character to uppercase.
117
Note: for a titlecase character, both of the preceding bits will be 1
118
and the signed offset will be 1.
119
1 bit If 1, this character has a titlecase equivalent (possibly itself);
120
in this case, the two bits before this bit can be used to decide
121
whether this character is in fact uppercase, lowercase, or titlecase.
122
3 bits This field provides a quick way to lex identifiers.
123
The eight possible values for this field are as follows:
124
0 May not be part of an identifier
125
1 Ignorable control; may continue a Unicode identifier or Java identifier
126
2 May continue a Java identifier but not a Unicode identifier (unused)
127
3 May continue a Unicode identifier or Java identifier
128
4 Is a Java whitespace character
129
5 May start or continue a Java identifier;
130
may continue but not start a Unicode identifier
131
(this value is used for connector punctuation such as _)
132
6 May start or continue a Java identifier;
133
may not occur in a Unicode identifier
134
(this value is used for currency symbols such as $)
135
7 May start or continue a Unicode identifier or Java identifier
136
Thus:
137
5, 6, 7 may start a Java identifier
138
1, 2, 3, 5, 6, 7 may continue a Java identifier
139
7 may start a Unicode identifier
140
1, 3, 5, 7 may continue a Unicode identifier
141
1 is ignorable within an identifier
142
4 is Java whitespace
143
2 bits This field indicates whether the character has a numeric property.
144
The four possible values for this field are as follows:
145
0 This character has no numeric property.
146
1 Adding the digit offset to the character code and then
147
masking with 0x1F will produce the desired numeric value.
148
2 This character has a "strange" numeric value.
149
3 A Java supradecimal digit: adding the digit offset to the
150
character code, then masking with 0x1F, then adding 10
151
will produce the desired numeric value.
152
5 bits The digit offset (see description of previous field)
153
5 bits Character type (see below)
154
155
B: the high 16 bits are defined as:
156
1 bit Other_Lowercase property
157
1 bit Other_Uppercase property
158
1 bit Other_Alphabetic property
159
1 bit Other_Math property
160
1 bit Ideographic property
161
1 bit Noncharacter codepoint property
162
*/
163
164
165
// Bit masks and shift counts for each component of the 32-bit "A" property
166
// word described above.
167
// shift* is the number of right shifts needed to bring the indicated
168
// property value into the lowest bits of the 32-bit word; mask* selects the
// same field in place (unshifted).
169
private static final int
170
shiftType = 0, maskType = 0x001F,
171
shiftDigitOffset = 5, maskDigitOffset = 0x03E0,
172
shiftNumericType = 10, maskNumericType = 0x0C00,
173
shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000,
174
// lowest bit of the identifier field; it is set for the field values
// 1, 3, 5 and 7, which the table above lists as "may continue a Unicode identifier"
maskUnicodePart = 0x1000,
175
// maskCaseInfo is the union of the three single-bit case masks that follow
shiftCaseInfo = 15, maskCaseInfo = 0x38000,
176
maskLowerCase = 0x20000,
177
maskUpperCase = 0x10000,
178
maskTitleCase = 0x08000,
179
shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000,
180
shiftCaseOffsetSign = 5,
181
// used only when calculating and
182
// storing digit offsets from char values
183
maskDigit = 0x001F,
184
// case offsets are 9 bits wide
185
maskCase = 0x01FF,
186
shiftBidi = 27, maskBidi = 0x78000000,
187
shiftMirrored = 31, //maskMirrored = 0x80000000,
188
shiftPlane = 16, maskPlane = 0xFF0000;
189
190
// maskMirrored must be a long: as an int, 0x80000000 is negative and would
// sign-extend into the upper bits (where the B-table property masks live)
// when OR'ed into a long property word.
191
private static final long maskMirrored = 0x80000000L;
192
193
// Bit masks for the 16-bit property field stored in the B table (described
194
// above). While a map entry is held as a single long these bits sit directly
// above the 32-bit A word, starting at bit 32; replaceCommand shifts the masks
// right by 32 before emitting them into the template.
// buildMap currently applies only Other_Lowercase, Other_Uppercase,
// Other_Alphabetic and Ideographic; its calls for Other_Math and
// Noncharacter_CodePoint are commented out.
195
private static final long
196
maskOtherLowercase = 0x100000000L,
197
maskOtherUppercase = 0x200000000L,
198
maskOtherAlphabetic = 0x400000000L,
199
maskOtherMath = 0x800000000L,
200
maskIdeographic = 0x1000000000L,
201
maskNoncharacterCP = 0x2000000000L;
202
203
// Can compare masked values with these to determine
204
// numeric or lexical types.
205
public static int
206
valueNotNumeric = 0x0000,
207
valueDigit = 0x0400,
208
valueStrangeNumeric = 0x0800,
209
valueJavaSupradecimal = 0x0C00,
210
valueIgnorable = 0x1000,
211
valueJavaOnlyPart = 0x2000,
212
valueJavaUnicodePart = 0x3000,
213
valueJavaWhitespace = 0x4000,
214
valueJavaStartUnicodePart = 0x5000,
215
valueJavaOnlyStart = 0x6000,
216
valueJavaUnicodeStart = 0x7000,
217
lowJavaStart = 0x5000,
218
nonzeroJavaPart = 0x3000,
219
valueUnicodeStart = 0x7000;
220
221
// These values are used when only identifier properties are generated
222
// for use in verifier code. Shortens the property down to a single byte.
// buildOne returns a combination of these two bits instead of the full
// property word when the "identifiers" flag is set.
223
private static final int
224
bitJavaStart = 0x02,
225
bitJavaPart = 0x01,
226
maskIsJavaIdentifierPart = bitJavaPart,
227
maskIsJavaIdentifierStart = bitJavaStart;
228
229
// Largest and smallest case offsets that buildOne stores as-is
// (maskCase / 2 == 255, so the range is -255..255); an offset outside this
// range is replaced by maskCase before being stored.
static int maxOffset = maskCase/2 ;
230
static int minOffset = -maxOffset;
231
232
/* The following routines provide simple, concise formatting of long integer values.
233
The number in the name of the method indicates the desired number of characters
234
to be produced. If the number of digits required to represent the integer value
235
is less than that number, then the output is padded on the left with zeros
236
(for hex) or with spaces (for decimal). If the number of digits required to
237
represent the integer value is greater than the desired number, then all the digits
238
that are required are actually produced.
239
*/
240
241
static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
242
243
static String hex2(long n) {
244
String q = Long.toHexString(n & 0xFF).toUpperCase();
245
return "00".substring(Math.min(2, q.length())) + q;
246
}
247
248
static String hex4(long n) {
249
String q = Long.toHexString(n & 0xFFFF).toUpperCase();
250
return "0000".substring(Math.min(4, q.length())) + q;
251
}
252
253
static String hex8(long n) {
254
String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
255
return "00000000".substring(Math.min(8, q.length())) + q;
256
}
257
258
static String hex16(long n) {
259
String q = Long.toHexString(n).toUpperCase();
260
return "0000000000000000".substring(Math.min(16, q.length())) + q;
261
}
262
263
static String dec3(long n) {
264
String q = Long.toString(n);
265
return " ".substring(Math.min(3, q.length())) + q;
266
}
267
268
static String dec5(long n) {
269
String q = Long.toString(n);
270
return " ".substring(Math.min(5, q.length())) + q;
271
}
272
273
/* This routine is called when some failure occurs. */
274
275
static void FAIL(String s) {
276
System.out.println("** " + s);
277
}
278
279
/**
280
* Given the data from the Unicode specification file, this routine builds a map.
281
*
282
* The specification file is assumed to contain its data in sorted order by
283
* character code; as a result, the array passed as an argument to this method
284
* has its components in the same sorted order, with one entry for each defined
285
* Unicode character or character range. (A range is indicated by two consecutive
286
* entries, such that the name of the first entry begins with "<" and ends with
287
* "First>" and the second entry begins with "<" and ends with "Last>".) This is
288
* therefore a sparse representation of the character property data.
289
*
290
* The resulting map is dense representation of the character data. It contains
291
* 2^16 = 65536 entries, each of which is a long integer. (Right now only 32 bits
292
* of this long value are used, but type long is used rather than int to facilitate
293
* future extensions of this source code generator that might require more than
294
* 32 bits to encode relevant character properties.) Entry k holds the encoded
295
* properties for character k.
296
*
297
* Method buildMap manages the transformation from the sparse representation to
298
* the dense representation. It calls method buildOne to handle the encoding
299
* of character property data from a single UnicodeSpec object into 32 bits.
300
* For undefined characters, method buildOne is not called and the map entry for
301
* that character is set to UnicodeSpec.UNASSIGNED.
302
*
303
* @param data character property data from the Unicode specification file
304
* @return an array of length 65536 with one entry for every possible char value
305
*
306
* @see GenerateCharacter#buildOne
307
*/
308
309
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
310
{
311
long[] result;
312
if (bLatin1 == true) {
313
result = new long[256];
314
} else {
315
result = new long[1<<16];
316
}
317
int k=0;
318
int codePoint = plane<<16;
319
UnicodeSpec nonCharSpec = new UnicodeSpec();
320
for (int j = 0; j < data.length && k < result.length; j++) {
321
if (data[j].codePoint == codePoint) {
322
result[k] = buildOne(codePoint, data[j], specialMaps);
323
++k;
324
++codePoint;
325
}
326
else if(data[j].codePoint > codePoint) {
327
if (data[j].name.endsWith("Last>")) {
328
// build map data for all chars except last in range
329
while (codePoint < data[j].codePoint && k < result.length) {
330
result[k] = buildOne(codePoint, data[j], specialMaps);
331
++k;
332
++codePoint;
333
}
334
}
335
else {
336
// we have a few unassigned chars before data[j].codePoint
337
while (codePoint < data[j].codePoint && k < result.length) {
338
result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
339
++k;
340
++codePoint;
341
}
342
}
343
k = data[j].codePoint & 0xFFFF;
344
codePoint = data[j].codePoint;
345
result[k] = buildOne(codePoint, data[j], specialMaps);
346
++k;
347
++codePoint;
348
}
349
else {
350
System.out.println("An error has occured during spec mapping.");
351
System.exit(0);
352
}
353
}
354
// if there are still unprocessed chars, process them
355
// as unassigned/undefined.
356
codePoint = (plane<<16) | k;
357
while (k < result.length) {
358
result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
359
++k;
360
++codePoint;
361
}
362
// now add all extra supported properties from PropList, to the
363
// upper 16-bit
364
addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
365
addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
366
addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
367
addExProp(result, propList, "Ideographic", maskIdeographic);
368
//addExProp(result, propList, "Other_Math", maskOtherMath);
369
//addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
370
371
return result;
372
}
373
374
// The maximum and minimum case offsets found while scanning the database.
// buildOne updates them for every character it encodes, using the offset
// before any out-of-range replacement. Both start at 0.
375
static int maxOffsetSeen = 0;
376
static int minOffsetSeen = 0;
377
378
/**
379
* Some Unicode separator characters are not considered Java whitespace.
380
* @param c character to test
381
* @return true if c in an invalid Java whitespace character, false otherwise.
382
*/
383
static boolean isInvalidJavaWhiteSpace(int c) {
384
int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
385
boolean retValue = false;
386
for(int x=0;x<exceptions.length;x++) {
387
if(c == exceptions[x]) {
388
retValue = true;
389
break;
390
}
391
}
392
return retValue;
393
394
}
395
396
/**
 * Encodes the properties of one character into a single long value.
 *
 * The low 32 bits (the "A" word) are assembled field by field: general
 * category, numeric type and digit offset, case flags and case offset,
 * identifier/whitespace class, bidi category (unless {@code nobidi}) and the
 * mirrored bit (unless {@code nomirror}). When {@code identifiers} is set the
 * assembled word is discarded and only a combination of {@code bitJavaStart}
 * and {@code bitJavaPart} is returned.
 *
 * Side effects: updates {@code minOffsetSeen} / {@code maxOffsetSeen}, prints
 * a warning on standard output when {@code verbose} is set and the character
 * has an uppercase mapping but neither a lowercase nor a titlecase mapping,
 * and reports case-offset problems through FAIL when DEBUG is true.
 *
 * @param c the character code for which to encode property data
 * @param us property data record from the Unicode specification file
 *        (its character code might not be equal to c if it specifies data
 *        for a range of characters)
 * @param specialMaps special case mappings. NOTE(review): this parameter is
 *        not read by the method; the lookup below uses the class-level
 *        {@code specialCaseMaps} instead - confirm which one is intended.
 * @return an encoded long value that contains the properties for a single char
 *
 * @see GenerateCharacter#buildMap
 */
411
412
static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
413
long resultA = 0;
414
// record the general category
415
resultA |= us.generalCategory;
416
417
// record the numeric properties
// Control flow of the two labeled blocks:
//   break NUMERIC - done with the numeric fields
//   break STRANGE - skip the digit offset and fall into the statement that
//                   marks the value as "strange" numeric
418
NUMERIC: {
419
STRANGE: {
420
int val = 0;
421
// c is A-Z
422
if ((c >= 0x0041) && (c <= 0x005A)) {
423
val = c - 0x0041;
424
resultA |= valueJavaSupradecimal;
425
// c is a-z
426
} else if ((c >= 0x0061) && (c <= 0x007A)) {
427
val = c - 0x0061;
428
resultA |= valueJavaSupradecimal;
429
// c is a full-width A-Z
430
} else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
431
val = c - 0xFF21;
432
resultA |= valueJavaSupradecimal;
433
// c is a full-width a-z
434
} else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
435
val = c - 0xFF41;
436
resultA |= valueJavaSupradecimal;
437
} else if (us.isDecimalValue()) {
438
val = us.decimalValue;
439
resultA |= valueDigit;
440
} else if (us.isDigitValue()) {
441
val = us.digitValue;
442
resultA |= valueDigit;
443
} else {
444
if (us.numericValue.length() == 0) {
445
break NUMERIC; // no numeric value at all
446
} else {
447
try {
448
val = Integer.parseInt(us.numericValue);
449
if (val >= 32 || val < 0) break STRANGE;
450
if (c == 0x215F) break STRANGE;
451
} catch(NumberFormatException e) {
// the numeric value is not a plain integer
452
break STRANGE;
453
}
454
resultA |= valueDigit;
455
}
456
}
// NOTE(review): on the decimal/digit branches valueDigit is already set when
// this check runs, so breaking here would leave valueDigit | valueStrangeNumeric,
// which equals valueJavaSupradecimal. Presumably unreachable because those
// values are 0-9 - confirm.
457
if (val >= 32 || val < 0) break STRANGE;
// Parsed as ((val - c) & maskDigit): the stored 5-bit offset is the one that,
// added to c and masked with 0x1F, yields val.
458
resultA |= ((val - c & maskDigit) << shiftDigitOffset);
459
break NUMERIC;
460
} // end STRANGE
461
resultA |= valueStrangeNumeric;
462
} // end NUMERIC
463
464
// record case mapping
465
int offset = 0;
466
// might have a 1:M mapping
467
int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
468
boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
469
if (bHasUpper) {
470
resultA |= maskUpperCase;
471
}
472
if (specialMap != -1) {
473
// has mapping, but cannot record the
474
// proper offset; can only flag it and provide special case
475
// code in Character.java
476
offset = -1;
477
}
478
else if (us.hasUpperMap()) {
479
offset = c - us.upperMap;
480
}
481
482
if (us.hasLowerMap()) {
483
resultA |= maskLowerCase;
484
if (offset == 0)
485
offset = us.lowerMap - c;
486
else if (offset != (us.lowerMap - c)) {
// a single offset field cannot describe both mappings; the uppercase offset is kept
487
if (DEBUG) {
488
FAIL("Character " + hex(c) +
489
" has incompatible lowercase and uppercase mappings");
490
}
491
}
492
}
493
if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
494
(bHasUpper && us.hasLowerMap())) {
495
resultA |= maskTitleCase;
496
}
497
if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
498
System.out.println("Warning: Character " + hex4(c) + " has upper but " +
499
"no title case; Java won't know this");
500
}
// the seen-range is tracked before the out-of-range replacement below
501
if (offset < minOffsetSeen) minOffsetSeen = offset;
502
if (offset > maxOffsetSeen) maxOffsetSeen = offset;
503
if (offset > maxOffset || offset < minOffset) {
504
if (DEBUG) {
505
FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
506
}
// all nine offset bits set; the special-map offset of -1 masks to the same pattern
507
offset = maskCase;
508
}
509
resultA |= ((offset & maskCase) << shiftCaseOffset);
510
511
// record lexical info about this character
// The branches are tested in order, so a general-category match wins over the
// code point range tests further down.
512
if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
513
|| us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
514
|| us.generalCategory == UnicodeSpec.TITLECASE_LETTER
515
|| us.generalCategory == UnicodeSpec.MODIFIER_LETTER
516
|| us.generalCategory == UnicodeSpec.OTHER_LETTER
517
|| us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
518
resultA |= valueJavaUnicodeStart;
519
}
520
else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
521
|| us.generalCategory == UnicodeSpec.NON_SPACING_MARK
522
|| us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
523
resultA |= valueJavaUnicodePart;
524
}
525
else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
526
resultA |= valueJavaStartUnicodePart;
527
}
528
else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
529
resultA |= valueJavaOnlyStart;
530
}
531
else if (((c >= 0x0000) && (c <= 0x0008))
532
|| ((c >= 0x000E) && (c <= 0x001B))
533
|| ((c >= 0x007F) && (c <= 0x009F))
534
|| us.generalCategory == UnicodeSpec.FORMAT) {
535
resultA |= valueIgnorable;
536
}
537
else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
538
|| us.generalCategory == UnicodeSpec.LINE_SEPARATOR
539
|| us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
// separators count as Java whitespace except the code points excluded by
// isInvalidJavaWhiteSpace
540
if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
541
}
542
else if (((c >= 0x0009) && (c <= 0x000D))
543
|| ((c >= 0x001C) && (c <= 0x001F))) {
544
resultA |= valueJavaWhitespace;
545
}
546
547
// record bidi category
548
if (!nobidi) {
// a category of -1 or above DIRECTIONALITY_OTHER_NEUTRALS is stored as all
// four bidi bits set (maskBidi)
549
int tmpBidi =
550
(us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
551
us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
552
resultA |= tmpBidi;
553
}
554
555
// record mirrored property
556
if (!nomirror) {
557
resultA |= us.mirrored ? maskMirrored : 0;
558
}
559
560
// identifier-only mode: reduce the word to bitJavaStart / bitJavaPart
561
if (identifiers) {
562
long replacement = 0;
563
if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
564
replacement |= bitJavaStart;
565
}
// "part" requires one of the two low identifier bits and excludes the
// ignorable class
566
if ( ((resultA & nonzeroJavaPart) != 0)
567
&& ((resultA & maskIdentifierInfo) != valueIgnorable)) {
568
replacement |= bitJavaPart;
569
}
570
resultA = replacement;
571
}
572
return resultA;
573
}
573
574
static void addExProp(long[] map, PropList propList, String prop, long mask) {
575
List<Integer> cps = propList.codepoints(prop);
576
if (cps != null) {
577
for (Integer cp : cps) {
578
if (cp < map.length)
579
map[cp] |= mask;
580
}
581
}
582
}
583
584
/**
585
* This is the heart of the table compression strategy. The inputs are a map
586
* and a number of bits (size). The map is simply an array of long integer values;
587
* the number of bits indicates how index values for that map are to be split.
588
* The length of the given map must be a multiple of (1 << size). The result is
589
* a new map z and a compressed table t such that for every valid index value k
590
* for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
591
*
592
* In other words, the index k can be split into two parts, namely the "size"
593
* low-order bits and all the remaining high-order bits; the high-order bits are then
594
* remapped by map z to produce an index into table t. In effect, the data of the
595
* original map m is broken up into blocks of size (1<<size); the compression relies
596
* on the expectation that many of these blocks will be identical and therefore need
597
* be represented only once in the compressed table t.
598
*
599
* This method is intended to be used iteratively. The first map to be handed
600
* to it is the one constructed by method buildMap. After that, the first of the
601
* two arrays returned by this method is fed back into it for further compression.
602
* At the end of the iteration, one has a starter map and a sequence of tables.
603
*
604
* The algorithm used to implement this computation is straightforward and not
605
* especially clever. It uses brute-force linear search (the loop labeled MIDDLE)
606
* to locate identical blocks, so overall the time complexity of the algorithm
607
* is quadratic in the length of the input map. Fortunately, speed is not crucial
608
* to this application.
609
*
610
* @param map a map to be compressed
611
* @param size the number of index bits to be split off by the compression
612
* @return an array of length 2 containing two arrays; the first is a new map
613
* and the second is a compressed data table
614
*
615
* @see GenerateCharacter#buildMap
616
*/
617
618
static long[][] buildTable(long[] map, int size) {
619
int n = map.length;
620
if (((n >> size) << size) != n) {
621
FAIL("Length " + n + " is not a multiple of " + (1 << size));
622
}
623
int m = 1 << size;
624
// We know the final length of the new map up front.
625
long[] newmap = new long[n >> size];
626
// The buffer is used temporarily to hold data for the compressed table
627
// because we don't know its final length yet.
628
long[] buffer = new long[n];
629
int ptr = 0;
630
OUTER: for (int i = 0; i < n; i += m) {
631
// For every block of size m in the original map...
632
MIDDLE: for (int j = 0; j < ptr; j += m) {
633
// Find out whether there is already a block just like it in the buffer.
634
for (int k = 0; k < m; k++) {
635
if (buffer[j+k] != map[i+k])
636
continue MIDDLE;
637
}
638
// There is a block just like it at position j, so just
639
// put its index into the new map (thereby sharing it).
640
newmap[i >> size] = (j >> size);
641
continue OUTER;
642
} // end MIDDLE
643
// There is no block just like it already, so add it to
644
// the buffer and put its index into the new map.
645
for (int k = 0; k < m; k++) {
646
buffer[ptr+k] = map[i+k];
647
}
648
newmap[i >> size] = (ptr >> size);
649
ptr += m;
650
} // end OUTER
651
// Now we know how long the compressed table should be,
652
// so create a new array and copy data from the temporary buffer.
653
long[] newdata = new long[ptr];
654
for (int j = 0; j < ptr; j++) {
655
newdata[j] = buffer[j];
656
}
657
// Return the new map and the new data table.
658
long[][] result = { newmap, newdata };
659
return result;
660
}
661
662
/**
663
* Once the compressed tables have been computed, this method reads in a
664
* template file for the source code to be generated and writes out the final
665
* source code by acting as a sort of specialized macro processor.
666
*
667
* The first output line is a comment saying that the file was automatically
668
* generated; it includes a timestamp. All other output is generated by
669
* reading a line from the template file, performing macro replacements,
670
* and then writing the resulting line or lines of code to the output file.
671
*
672
* This method handles the I/O, the timestamp comment, and the locating of
673
* macro calls within each input line. The method replaceCommand is called
674
* to generate replacement text for each macro call.
675
*
676
* Macro calls to be replaced are indicated in the template file by
677
* occurrences of the commandMarker "$$". The rest of the call may consist
678
* of Java letters (including the underscore "_") and also of balanced
679
* parentheses.
680
*
681
* @param theTemplateFileName
682
* the file name for the template input file
683
* @param theOutputFileName
684
* the file name for the source code output file
685
*
686
* @see GenerateCharacter#replaceCommand
687
*/
688
689
static void generateCharacterClass(String theTemplateFileName,
690
String theOutputFileName)
691
throws FileNotFoundException, IOException {
692
BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
693
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
694
out.println(commentStart +
695
" This file was generated AUTOMATICALLY from a template file " +
696
new java.util.Date() + commentEnd);
697
int marklen = commandMarker.length();
698
LOOP: while(true) {
699
try {
700
String line = in.readLine();
701
if (line == null) break LOOP;
702
int pos = 0;
703
int depth = 0;
704
while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
705
int newpos = pos + marklen;
706
char ch = 'x';
707
SCAN: while (newpos < line.length() &&
708
(Character.isJavaIdentifierStart(ch = line.charAt(newpos))
709
|| ch == '(' || (ch == ')' && depth > 0))) {
710
++newpos;
711
if (ch == '(') {
712
++depth;
713
}
714
else if (ch == ')') {
715
--depth;
716
if (depth == 0)
717
break SCAN;
718
}
719
}
720
String replacement = replaceCommand(line.substring(pos + marklen, newpos));
721
line = line.substring(0, pos) + replacement + line.substring(newpos);
722
pos += replacement.length();
723
}
724
out.println(line);
725
}
726
catch (IOException e) {
727
break LOOP;
728
}
729
}
730
in.close();
731
out.close();
732
}
733
734
/**
735
* The replaceCommand method takes a command (a macro call without the
736
* leading marker "$$") and computes replacement text for it.
737
*
738
* Most of the commands are simply names of integer constants that are defined
739
* in the source code of this GenerateCharacter class. The replacement text is
740
* simply the value of the constant as an appropriately formatted integer literal.
741
*
742
* Two cases are more complicated, however. The command "Tables" causes the
743
* final map and compressed tables to be emitted, with elaborate comments
744
* describing their contents. (This is actually handled by method genTables.)
745
* The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
746
* an expression that will return the character property data for the character
747
* whose code is the value of the variable "xxx". (this is handled by method
748
* "genAccess".)
749
*
750
* @param x a command from the template file to be replaced
751
* @return the replacement text, as a String
752
*
753
* @see GenerateCharacter#genTables
754
* @see GenerateCharacter#genAccess
755
* @see GenerateCharacter#generateCharacterClass
756
*/
757
758
static String replaceCommand(String x) {
759
if (x.equals("Tables")) return genTables();
760
if (x.equals("Initializers")) return genInitializers();
761
if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
762
x.substring(x.length()-1).equals(")") )
763
return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
764
if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
765
x.substring(x.length()-1).equals(")") )
766
return genAccess("B", x.substring(9, x.length()-1), 16);
767
if (x.equals("shiftType")) return Long.toString(shiftType);
768
if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
769
if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
770
if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
771
if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
772
if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
773
if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
774
if (x.equals("maskCase")) return "0x" + hex8(maskCase);
775
if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
776
if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
777
if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
778
if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
779
if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
780
if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781
if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
783
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
786
if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
787
if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
788
if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
789
if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
790
if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
791
if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
792
if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
793
if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
794
if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
795
if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
796
if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
797
if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
798
if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
799
if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
800
if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
801
if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
802
if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
803
if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
804
if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
805
if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
806
if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
807
if (x.equals("maskType")) return "0x" + hex(maskType);
808
if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
809
if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
810
if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
811
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
812
return Integer.toString(UnicodeSpec.UNASSIGNED);
813
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
814
return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
815
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
816
return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
817
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
818
return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
819
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
820
return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
821
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
822
return Integer.toString(UnicodeSpec.OTHER_LETTER);
823
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
824
return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
825
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
826
return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
827
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
828
return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
829
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
830
return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
831
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
832
return Integer.toString(UnicodeSpec.OTHER_NUMBER);
833
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
834
return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
835
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
836
return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
837
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
838
return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
839
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
840
return Integer.toString(UnicodeSpec.CONTROL);
841
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
842
return Integer.toString(UnicodeSpec.FORMAT);
843
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
844
return Integer.toString(UnicodeSpec.PRIVATE_USE);
845
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
846
return Integer.toString(UnicodeSpec.SURROGATE);
847
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
848
return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
849
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
850
return Integer.toString(UnicodeSpec.START_PUNCTUATION);
851
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
852
return Integer.toString(UnicodeSpec.END_PUNCTUATION);
853
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
854
return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
855
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
856
return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
857
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
858
return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
859
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
860
return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
861
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
862
return Integer.toString(UnicodeSpec.LETTER_NUMBER);
863
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
864
return Integer.toString(UnicodeSpec.MATH_SYMBOL);
865
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
866
return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
867
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
868
return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
869
if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
870
return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
871
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
872
return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
873
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
874
return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
875
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
876
return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
877
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
878
return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
879
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
880
return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
881
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
882
return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
883
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
884
return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
885
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
886
return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
887
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
888
return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
889
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
890
return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
891
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
892
return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
893
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
894
return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
895
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
896
return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
897
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
898
return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
899
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
900
return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
901
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
902
return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
903
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
904
return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
905
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
906
return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
907
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
908
return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
909
FAIL("Unknown text substitution marker " + commandMarker + x);
910
return commandMarker + x;
911
}
912
913
/**
914
* The genTables method generates source code for all the lookup tables
915
* needed to represent the various Unicode character properties.
916
* It simply calls the method genTable once for each table to be generated
917
* and then generates a summary comment.
918
*
919
* @return the replacement text for the "Tables" command, as a String
920
*
921
* @see GenerateCharacter#genTable
922
* @see GenerateCharacter#replaceCommand
923
*/
924
static String genTables() {
925
int n = sizes.length;
926
StringBuffer result = new StringBuffer();
927
// liu : Add a comment showing the source of this table
928
result.append(commentStart + " The following tables and code generated using:" +
929
commentEnd + "\n ");
930
result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n ");
931
932
if (plane == 0 && bLatin1 == false) {
933
genCaseMapTableDeclaration(result);
934
genCaseMapTable(initializers, specialCaseMaps);
935
}
936
int totalBytes = 0;
937
for (int k = 0; k < n - 1; k++) {
938
genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
939
sizes[k+1], false, false, k==0);
940
int s = bytes[k];
941
if (s == 1 && useCharForByte) {
942
s = 2;
943
}
944
totalBytes += tables[k].length * s;
945
}
946
genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
947
sizes[n - 1], false, 0, true, !(identifiers), false);
948
949
// If we ever need more than 32 bits to represent the character properties,
950
// then a table "B" may be needed as well.
951
genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
952
953
totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
954
result.append(commentStart);
955
result.append(" In all, the character property tables require ");
956
result.append(totalBytes).append(" bytes.").append(commentEnd);
957
if (verbose) {
958
System.out.println("The character property tables require "
959
+ totalBytes + " bytes.");
960
}
961
return result.toString();
962
}
963
964
/**
965
* The genInitializers method generates the body of the
966
* ensureInitted() method, which enables lazy initialization of
967
* the case map table and other tables.
968
*/
969
static String genInitializers() {
970
return initializers.toString();
971
}
972
973
/**
974
* Return the total number of bytes needed by all tables. This is a stripped-
975
* down copy of genTables().
976
*/
977
static int getTotalBytes() {
978
int n = sizes.length;
979
int totalBytes = 0;
980
for (int k = 0; k < n - 1; k++) {
981
totalBytes += tables[k].length * bytes[k];
982
}
983
totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
984
+ 31) >> 5) << 2);
985
return totalBytes;
986
}
987
988
static void appendEscapedStringFragment(StringBuffer result,
989
char[] line,
990
int length,
991
boolean lastFragment) {
992
result.append(" \"");
993
for (int k=0; k<length; ++k) {
994
result.append("\\u");
995
result.append(hex4(line[k]));
996
}
997
result.append("\"");
998
result.append(lastFragment ? ";" : "+");
999
result.append("\n");
1000
}
1001
1002
static String SMALL_INITIALIZER =
1003
" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1004
// " $$name = new $$type[$$size];\n"+
1005
" int len = $$name_DATA.length();\n"+
1006
" int j=0;\n"+
1007
" for (int i=0; i<len; ++i) {\n"+
1008
" int c = $$name_DATA.charAt(i);\n"+
1009
" for (int k=0; k<$$entriesPerChar; ++k) {\n"+
1010
" $$name[j++] = ($$type)c;\n"+
1011
" c >>= $$bits;\n"+
1012
" }\n"+
1013
" }\n"+
1014
" assert (j == $$size);\n"+
1015
" }\n";
1016
1017
static String SAME_SIZE_INITIALIZER =
1018
" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1019
" assert ($$name_DATA.length() == $$size);\n"+
1020
// " $$name = new $$type[$$size];\n"+
1021
" for (int i=0; i<$$size; ++i)\n"+
1022
" $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
1023
" }\n";
1024
1025
static String BIG_INITIALIZER =
1026
" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1027
// " $$name = new $$type[$$size];\n"+
1028
" int len = $$name_DATA.length();\n"+
1029
" int j=0;\n"+
1030
" int charsInEntry=0;\n"+
1031
" $$type entry=0;\n"+
1032
" for (int i=0; i<len; ++i) {\n"+
1033
" entry |= $$name_DATA.charAt(i);\n"+
1034
" if (++charsInEntry == $$charsPerEntry) {\n"+
1035
" $$name[j++] = entry;\n"+
1036
" entry = 0;\n"+
1037
" charsInEntry = 0;\n"+
1038
" }\n"+
1039
" else {\n"+
1040
" entry <<= 16;\n"+
1041
" }\n"+
1042
" }\n"+
1043
" assert (j == $$size);\n"+
1044
" }\n";
1045
1046
static String INT32_INITIALIZER =
1047
" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1048
" char[] data = $$name_DATA.toCharArray();\n"+
1049
" assert (data.length == ($$size * 2));\n"+
1050
" int i = 0, j = 0;\n"+
1051
" while (i < ($$size * 2)) {\n"+
1052
" int entry = data[i++] << 16;\n"+
1053
" $$name[j++] = entry | data[i++];\n"+
1054
" }\n"+
1055
" }\n";
1056
1057
static void addInitializer(String name, String type, int entriesPerChar,
1058
int bits, int size) {
1059
1060
String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1061
((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1062
if (entriesPerChar == -2) {
1063
template = INT32_INITIALIZER;
1064
}
1065
int marklen = commandMarker.length();
1066
int pos = 0;
1067
while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1068
int newpos = pos + marklen;
1069
char ch = 'x';
1070
while (newpos < template.length() &&
1071
Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1072
ch != '_') // Don't allow this in token names
1073
++newpos;
1074
String token = template.substring(pos+marklen, newpos);
1075
String replacement = "ERROR";
1076
1077
if (token.equals("name")) replacement = name;
1078
else if (token.equals("type")) replacement = type;
1079
else if (token.equals("bits")) replacement = ""+bits;
1080
else if (token.equals("size")) replacement = ""+size;
1081
else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1082
else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1083
else FAIL("Unrecognized token: " + token);
1084
1085
template = template.substring(0, pos) + replacement + template.substring(newpos);
1086
pos += replacement.length();
1087
}
1088
initializers.append(template);
1089
}
1090
1091
/**
1092
* The genTable method generates source code for one lookup table.
1093
* Most of the complexity stems from handling various options as to
1094
* the type of the array components, the precise representation of the
1095
* values, the format in which to render each value, the number of values
1096
* to emit on each line of source code, and the kinds of useful comments
1097
* to be generated.
1098
*
1099
* @param result a StringBuffer, to which the generated source code
1100
* text is to be appended
1101
* @param name the name of the table
1102
* @param table the table data (an array of long values)
1103
* @param extract a distance, in bits, by which each entry of the table
1104
* is to be right-shifted before it is processed
1105
* @param bits the number of bits (not bytes) to be used to represent
1106
* each table entry
1107
* @param size the table data is divided up into blocks of size (1<<size);
1108
* in this method, this information is used only to affect
1109
* how many table values are to be generated per line
1110
* @param preshifted if this flag is true, then the table entries are to be
1111
* emitted in a preshifted form; that is, each value should
1112
* be left-shifted by the amount "shift", so that this work
1113
* is built into the table and need not be performed by an
1114
* explicit shift operator at run time
1115
* @param shift this is the shift amount for preshifting of table entries
1116
* @param hexFormat if this flag is true, table entries should be emitted as
1117
* hexadecimal literals; otherwise decimal literals are used
1118
* @param properties if this flag is true, the table entries are encoded
1119
* character properties rather than indexes into yet other tables;
1120
* therefore comments describing the encoded properties should
1121
* be generated
1122
* @param hexComment if this flag is true, each line of output is labelled with
1123
* a hexadecimal comment indicating the character values to
1124
* which that line applies; otherwise, decimal values indicating
1125
* table indices are generated
1126
*
1127
* @see GenerateCharacter#genTables
1128
* @see GenerateCharacter#replaceCommand
1129
*/
1130
1131
static void genTable(StringBuffer result, String name,
1132
long[] table, int extract, int bits, int size,
1133
boolean preshifted, int shift, boolean hexFormat,
1134
boolean properties, boolean hexComment) {
1135
1136
String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1137
bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1138
bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1139
bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1140
bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1141
bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1142
(Csyntax ? "int64" : "long");
1143
long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1144
bits == 2 ? Integer.MAX_VALUE :
1145
bits == 4 ? Integer.MAX_VALUE :
1146
bits == 8 ? Byte.MAX_VALUE :
1147
bits == 16 ? Short.MAX_VALUE :
1148
bits == 32 ? Integer.MAX_VALUE :
1149
Long.MAX_VALUE;
1150
int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1151
boolean shiftEntries = preshifted && shift != 0;
1152
if (bits == 8 && tableAsString && useCharForByte) {
1153
atype = "char";
1154
maxPosEntry = Character.MAX_VALUE;
1155
entriesPerChar = 1;
1156
}
1157
boolean noConversion = atype.equals("char");
1158
1159
result.append(commentStart);
1160
result.append(" The ").append(name).append(" table has ").append(table.length);
1161
result.append(" entries for a total of ");
1162
int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1163
if (bits == 8 && useCharForByte) {
1164
sizeOfTable *= 2;
1165
}
1166
result.append(sizeOfTable);
1167
result.append(" bytes.").append(commentEnd).append("\n\n");
1168
if (Csyntax)
1169
result.append(" static ");
1170
else
1171
result.append(" static final ");
1172
result.append(atype);
1173
result.append(" ").append(name).append("[");
1174
if (Csyntax)
1175
result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1176
if (tableAsString) {
1177
if (noConversion) {
1178
result.append("] = (\n");
1179
} else {
1180
result.append("] = new ").append(atype).append("["+table.length+"];\n ");
1181
result.append("static final String ").append(name).append("_DATA =\n");
1182
}
1183
int CHARS_PER_LINE = 8;
1184
StringBuffer theString = new StringBuffer();
1185
int entriesInCharSoFar = 0;
1186
char ch = '\u0000';
1187
int charsPerEntry = -entriesPerChar;
1188
for (int j=0; j<table.length; ++j) {
1189
//long entry = table[j] >> extract;
1190
long entry;
1191
if ("A".equals(name))
1192
entry = (table[j] & 0xffffffffL) >> extract;
1193
else
1194
entry = (table[j] >> extract);
1195
if (shiftEntries) entry <<= shift;
1196
if (entry >= (1L << bits)) {
1197
FAIL("Entry too big");
1198
}
1199
if (entriesPerChar > 0) {
1200
// Pack multiple entries into a character
1201
ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1202
++entriesInCharSoFar;
1203
if (entriesInCharSoFar == entriesPerChar) {
1204
// Character is full
1205
theString.append(ch);
1206
entriesInCharSoFar = 0;
1207
ch = '\u0000';
1208
}
1209
}
1210
else {
1211
// Use multiple characters per entry
1212
for (int k=0; k<charsPerEntry; ++k) {
1213
ch = (char)(entry >> ((charsPerEntry-1)*16));
1214
entry <<= 16;
1215
theString.append(ch);
1216
}
1217
}
1218
}
1219
if (entriesInCharSoFar > 0) {
1220
while (entriesInCharSoFar < entriesPerChar) {
1221
ch = (char)((int)ch >> bits);
1222
++entriesInCharSoFar;
1223
}
1224
theString.append(ch);
1225
entriesInCharSoFar = 0;
1226
}
1227
result.append(Utility.formatForSource(theString.toString(), " "));
1228
if (noConversion) {
1229
result.append(").toCharArray()");
1230
}
1231
result.append(";\n\n ");
1232
1233
if (!noConversion) {
1234
addInitializer(name, atype, entriesPerChar, bits, table.length);
1235
}
1236
}
1237
else {
1238
result.append("] = {");
1239
boolean castEntries = shiftEntries && (bits < 32);
1240
int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1241
bits == 2 ? 16*4 :
1242
bits == 4 ? 8*4 :
1243
bits == 8 ? 8 :
1244
bits == 16 ? 8 :
1245
bits == 32 ? 4 : 2) :
1246
(bits == 8 ? 8 :
1247
bits == 16 ? 8 : 4);
1248
int printMask = properties ? 0 :
1249
Math.min(1 << size,
1250
printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1251
int commentShift = ((1 << size) == table.length) ? 0 : size;
1252
int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1253
long val = 0;
1254
for (int j = 0; j < table.length; j++) {
1255
if ((j & printMask) == 0) {
1256
while (result.charAt(result.length() - 1) == ' ')
1257
result.setLength(result.length() - 1);
1258
result.append("\n ");
1259
}
1260
PRINT: {
1261
if (castEntries)
1262
result.append("(").append(atype).append(")(");
1263
long entry = table[j] >> extract;
1264
int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1265
int k = j & packMask;
1266
if (bits >= 8)
1267
val = entry;
1268
else if (k == 0) {
1269
val = entry;
1270
break PRINT;
1271
}
1272
else {
1273
val |= (entry << (k*bits));
1274
if (k != packMask)
1275
break PRINT;
1276
}
1277
if (val > maxPosEntry && !Csyntax) { // liu
1278
// For values that are out of range, convert them to in-range negative values.
1279
// Actually, output the '-' and convert them to the negative of the corresponding
1280
// in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126.
1281
result.append('-');
1282
val = maxPosEntry + maxPosEntry + 2 - val;
1283
}
1284
if (hexFormat) {
1285
result.append("0x");
1286
if (bits == 8)
1287
result.append(hex2((byte)val));
1288
else if (bits == 16)
1289
result.append(hex4((short)val));
1290
else if (bits == 32 || bits < 8)
1291
result.append(hex8((int)val));
1292
else {
1293
result.append(hex16(val));
1294
if (!Csyntax)
1295
result.append("L");
1296
}
1297
}
1298
else {
1299
if (bits == 8)
1300
result.append(dec3(val));
1301
else if (bits == 64) {
1302
result.append(dec5(val));
1303
if (!Csyntax)
1304
result.append("L");
1305
}
1306
else
1307
result.append(dec5(val));
1308
}
1309
if (shiftEntries)
1310
result.append("<<").append(shift);
1311
if (castEntries) result.append(")");
1312
if (j < (table.length - 1))
1313
result.append(", ");
1314
else
1315
result.append(" ");
1316
if ((j & printMask) == printMask) {
1317
result.append(" ").append(commentStart).append(" ");
1318
if (hexComment)
1319
result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1320
else
1321
result.append(dec3((j & ~commentMask) >> commentShift));
1322
if (properties) propertiesComments(result, val);
1323
result.append(commentEnd);
1324
}
1325
} // end PRINT
1326
}
1327
result.append("\n };\n\n ");
1328
}
1329
}
1330
1331
static void genCaseMapTableDeclaration(StringBuffer result) {
1332
String myTab = " ";
1333
result.append(myTab + "static final char[][][] charMap;\n");
1334
}
1335
1336
static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1337
String myTab = " ";
1338
int ch;
1339
char[] map;
1340
result.append(myTab + "charMap = new char[][][] {\n");
1341
for (int x = 0; x < specialCaseMaps.length; x++) {
1342
ch = specialCaseMaps[x].getCharSource();
1343
map = specialCaseMaps[x].getUpperCaseMap();
1344
result.append(myTab + myTab);
1345
result.append("{ ");
1346
result.append("{\'\\u"+hex4(ch)+"\'}, {");
1347
for (int y = 0; y < map.length; y++) {
1348
result.append("\'\\u"+hex4(map[y])+"\', ");
1349
}
1350
result.append("} },\n");
1351
}
1352
result.append(myTab + "};\n");
1353
1354
}
1355
1356
/**
1357
* The propertiesComments method generates comments describing encoded
1358
* character properties.
1359
*
1360
* @param result a StringBuffer, to which the generated source code
1361
* text is to be appended
1362
* @param val encoded character properties
1363
*
1364
* @see GenerateCharacter#genTable
1365
*/
1366
1367
static void propertiesComments(StringBuffer result, long val) {
1368
result.append(" ");
1369
switch ((int)(val & maskType)) {
1370
case UnicodeSpec.CONTROL:
1371
result.append("Cc");
1372
break;
1373
case UnicodeSpec.FORMAT:
1374
result.append("Cf");
1375
break;
1376
case UnicodeSpec.PRIVATE_USE:
1377
result.append("Co");
1378
break;
1379
case UnicodeSpec.SURROGATE:
1380
result.append("Cs");
1381
break;
1382
case UnicodeSpec.LOWERCASE_LETTER:
1383
result.append("Ll");
1384
break;
1385
case UnicodeSpec.MODIFIER_LETTER:
1386
result.append("Lm");
1387
break;
1388
case UnicodeSpec.OTHER_LETTER:
1389
result.append("Lo");
1390
break;
1391
case UnicodeSpec.TITLECASE_LETTER:
1392
result.append("Lt");
1393
break;
1394
case UnicodeSpec.UPPERCASE_LETTER:
1395
result.append("Lu");
1396
break;
1397
case UnicodeSpec.COMBINING_SPACING_MARK:
1398
result.append("Mc");
1399
break;
1400
case UnicodeSpec.ENCLOSING_MARK:
1401
result.append("Me");
1402
break;
1403
case UnicodeSpec.NON_SPACING_MARK:
1404
result.append("Mn");
1405
break;
1406
case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1407
result.append("Nd");
1408
break;
1409
case UnicodeSpec.LETTER_NUMBER:
1410
result.append("Nl");
1411
break;
1412
case UnicodeSpec.OTHER_NUMBER:
1413
result.append("No");
1414
break;
1415
case UnicodeSpec.CONNECTOR_PUNCTUATION:
1416
result.append("Pc");
1417
break;
1418
case UnicodeSpec.DASH_PUNCTUATION:
1419
result.append("Pd");
1420
break;
1421
case UnicodeSpec.END_PUNCTUATION:
1422
result.append("Pe");
1423
break;
1424
case UnicodeSpec.OTHER_PUNCTUATION:
1425
result.append("Po");
1426
break;
1427
case UnicodeSpec.START_PUNCTUATION:
1428
result.append("Ps");
1429
break;
1430
case UnicodeSpec.CURRENCY_SYMBOL:
1431
result.append("Sc");
1432
break;
1433
case UnicodeSpec.MODIFIER_SYMBOL:
1434
result.append("Sk");
1435
break;
1436
case UnicodeSpec.MATH_SYMBOL:
1437
result.append("Sm");
1438
break;
1439
case UnicodeSpec.OTHER_SYMBOL:
1440
result.append("So");
1441
break;
1442
case UnicodeSpec.LINE_SEPARATOR:
1443
result.append("Zl"); break;
1444
case UnicodeSpec.PARAGRAPH_SEPARATOR:
1445
result.append("Zp");
1446
break;
1447
case UnicodeSpec.SPACE_SEPARATOR:
1448
result.append("Zs");
1449
break;
1450
case UnicodeSpec.UNASSIGNED:
1451
result.append("unassigned");
1452
break;
1453
}
1454
1455
switch ((int)((val & maskBidi) >> shiftBidi)) {
1456
case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1457
result.append(", L");
1458
break;
1459
case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1460
result.append(", R");
1461
break;
1462
case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1463
result.append(", EN");
1464
break;
1465
case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1466
result.append(", ES");
1467
break;
1468
case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1469
result.append(", ET");
1470
break;
1471
case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1472
result.append(", AN");
1473
break;
1474
case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1475
result.append(", CS");
1476
break;
1477
case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1478
result.append(", B");
1479
break;
1480
case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1481
result.append(", S");
1482
break;
1483
case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1484
result.append(", WS");
1485
break;
1486
case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1487
result.append(", ON");
1488
break;
1489
}
1490
if ((val & maskUpperCase) != 0) {
1491
result.append(", hasUpper (subtract ");
1492
result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1493
}
1494
if ((val & maskLowerCase) != 0) {
1495
result.append(", hasLower (add ");
1496
result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1497
}
1498
if ((val & maskTitleCase) != 0) {
1499
result.append(", hasTitle");
1500
}
1501
if ((val & maskIdentifierInfo) == valueIgnorable) {
1502
result.append(", ignorable");
1503
}
1504
if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1505
result.append(", identifier part");
1506
}
1507
if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1508
result.append(", underscore");
1509
}
1510
if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1511
result.append(", whitespace");
1512
}
1513
if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1514
result.append(", currency");
1515
}
1516
if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1517
result.append(", identifier start");
1518
}
1519
if ((val & maskNumericType) == valueDigit) {
1520
result.append(", decimal ");
1521
result.append((val & maskDigitOffset) >> shiftDigitOffset);
1522
}
1523
if ((val & maskNumericType) == valueStrangeNumeric) {
1524
result.append(", strange");
1525
}
1526
if ((val & maskNumericType) == valueJavaSupradecimal) {
1527
result.append(", supradecimal ");
1528
result.append((val & maskDigitOffset) >> shiftDigitOffset);
1529
}
1530
}
1531
1532
/* Names given to the intermediate (non-final) lookup tables, indexed by
   lookup level; see tableName and genAccess. */
static String[] tableNames = {
    "X", "Y", "Z",
    "P", "Q", "R", "S", "T", "U", "V", "W"
};
1533
1534
/**
 * Returns the name of the intermediate lookup table for a given level.
 *
 * @param j index into the {@code tableNames} array
 * @return the name stored at index {@code j}
 */
static String tableName(int j) {
    return tableNames[j];
}
1535
1536
/**
1537
* The genAccess method generates source code for one table access expression.
1538
*
1539
* Most of the complexity stems from handling various options as to
1540
* table representation, such as whether it contains values so large that
1541
* they are represented as negative values and whether the table values are
1542
* preshifted. This method also avoids such "ugly" expressions as shifting
1543
* by distance zero, masking when no masking is necessary, and so on.
1544
* For clarity, it generates expressions that do not rely on operator
1545
* precedence, but otherwise it avoids generating redundant parentheses.
1546
*
1547
* A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1548
* or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1549
*
1550
* @param tbl the name of the final table to be accessed
1551
* @param var the variable name that appeared in parentheses in the
1552
* "Lookup" command
1553
* @param bits the number of bits (not bytes) to be used to represent
1554
* the final table entry
1555
* @return the replacement text for the "Lookup(xxx)" command, as a String
1556
*
1557
* @see GenerateCharacter#replaceCommand
1558
*/
1559
1560
static String genAccess(String tbl, String var, int bits) {
1561
String access = null;
1562
int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1563
for (int k = 0; k < sizes.length; k++) {
1564
int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1565
int shift = shifts[k] + offset;
1566
String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1567
int mask = (1 << (sizes[k] - offset)) - 1;
1568
String masked = (k == 0) ? shifted :
1569
"(" + shifted + "&0x" + hex(mask) + ")";
1570
String index = (k == 0) ? masked :
1571
(mask == 0) ? access : "(" + access + "|" + masked + ")";
1572
String indexNoParens = (index.charAt(0) != '(') ? index :
1573
index.substring(1, index.length() - 1);
1574
String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1575
String fetched = tblname + "[" + indexNoParens + "]";
1576
String zeroextended = (zeroextend[k] == 0) ? fetched :
1577
"(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1578
int adjustment = preshifted[k] ? 0 :
1579
sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1580
String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1581
"(" + zeroextended + "<<" + adjustment + ")";
1582
String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1583
(bits == 2) ? "((" + var + "&0xF)<<1)" :
1584
(bits == 4) ? "((" + var + "&7)<<2)" : null;
1585
String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1586
"((" + adjusted + ">>" + bitshift + ")&" +
1587
(bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1588
access = extracted;
1589
}
1590
return access;
1591
}
1592
1593
/* The command line arguments are decoded and used to set the following
1594
global variables.
1595
*/
1596
1597
static boolean verbose = false;
1598
static boolean nobidi = false;
1599
static boolean nomirror = false;
1600
static boolean identifiers = false;
1601
static boolean Csyntax = false;
1602
static String TemplateFileName = null;
1603
static String OutputFileName = null;
1604
static String UnicodeSpecFileName = null; // liu
1605
static String SpecialCasingFileName = null;
1606
static String PropListFileName = null;
1607
static boolean useCharForByte = false;
1608
static int[] sizes;
1609
static int bins = 0; // liu; if > 0, then perform search
1610
static boolean tableAsString = false;
1611
static boolean bLatin1 = false;
1612
1613
static String commandLineDescription;
1614
1615
/* Other global variables, equal in length to the "sizes" array. */
1616
1617
static int[] shifts;
1618
static int[] zeroextend;
1619
static int[] bytes;
1620
static boolean[] preshifted;
1621
static long[][] tables;
1622
1623
1624
/* Other global variables */
1625
static String commentStart;
1626
static String commentEnd;
1627
1628
static StringBuffer initializers = new StringBuffer();
1629
1630
/* special casing rules for 1:M toUpperCase mappings */
1631
static SpecialCaseMap[] specialCaseMaps;
1632
1633
/**
1634
* Process the command line arguments.
1635
*
1636
* The allowed flags in command line are:
1637
* <dl>
1638
* <dt> -verbose <dd> Emit comments to standard output describing
1639
* what's going on during the processing.
1640
* <dt> -nobidi <dd> Do not include bidi categories in the
1641
* encoded character properties.
1642
* <dt> -nomirror <dd> Do no include mirror property in the encoded
1643
* character properties.
1644
* <dt> -identifiers <dd> Generate tables for scanning identifiers only.
1645
* <dt> -c <dd> Output code in C syntax instead of Java syntax.
1646
* <dt> -o filename <dd> Specify output file name.
1647
* <dt> -template filename <dd> Specify template input file name.
1648
* <dt> -spec filename <dd> Specify Unicode spec file name.
1649
* <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1650
* <dt> -search bins <dd> Try different partitions into the specified
1651
* number of bins. E.g., for 2 bins, try
1652
* 16 0, 15 1,..., 0 16.
1653
* <dt> -string <dd> Create table as string. Only valid with Java
1654
* syntax.
1655
* <dt> -latin1 <dd> Create a latin 1 only property table.
1656
* </dl>
1657
* In addition, decimal literals may appear as command line arguments;
1658
* each one represents the number of bits of the character to be broken
1659
* off at each lookup step. If present, they must add up to 16 (the number
1660
* of bits in a char value). For smaller tables, the last value should
1661
* be 0; values other than the last one may not be zero. If no such
1662
* numeric values are provided, default values are used.
1663
*
1664
* @param args the command line arguments, as an array of String
1665
*
1666
* @see GenerateCharacter#main
1667
*/
1668
1669
static void processArgs(String[] args) {
1670
StringBuffer desc = new StringBuffer("java GenerateCharacter");
1671
for (int j=0; j<args.length; ++j) {
1672
desc.append(" " + args[j]);
1673
}
1674
for (int j = 0; j < args.length; j++) {
1675
if (args[j].equals("-verbose") || args[j].equals("-v"))
1676
verbose = true;
1677
else if (args[j].equals("-nobidi"))
1678
nobidi = true;
1679
else if (args[j].equals("-nomirror"))
1680
nomirror = true;
1681
else if (args[j].equals("-identifiers"))
1682
identifiers = true;
1683
else if (args[j].equals("-c"))
1684
Csyntax = true;
1685
else if (args[j].equals("-string"))
1686
tableAsString = true;
1687
else if (args[j].equals("-o")) {
1688
if (j == args.length - 1) {
1689
FAIL("File name missing after -o");
1690
}
1691
else {
1692
OutputFileName = args[++j];
1693
}
1694
}
1695
else if (args[j].equals("-search")) {
1696
if (j == args.length - 1)
1697
FAIL("Bin count missing after -search");
1698
else {
1699
bins = Integer.parseInt(args[++j]);
1700
if (bins < 1 || bins > 10)
1701
FAIL("Bin count must be >= 1 and <= 10");
1702
}
1703
}
1704
else if (args[j].equals("-template")) {
1705
if (j == args.length - 1)
1706
FAIL("File name missing after -template");
1707
else
1708
TemplateFileName = args[++j];
1709
}
1710
else if (args[j].equals("-spec")) { // liu
1711
if (j == args.length - 1) {
1712
FAIL("File name missing after -spec");
1713
}
1714
else {
1715
UnicodeSpecFileName = args[++j];
1716
}
1717
}
1718
else if (args[j].equals("-specialcasing")) {
1719
if (j == args.length -1) {
1720
FAIL("File name missing after -specialcasing");
1721
}
1722
else {
1723
SpecialCasingFileName = args[++j];
1724
}
1725
}
1726
else if (args[j].equals("-proplist")) {
1727
if (j == args.length -1) {
1728
FAIL("File name missing after -proplist");
1729
}
1730
else {
1731
PropListFileName = args[++j];
1732
}
1733
}
1734
else if (args[j].equals("-plane")) {
1735
if (j == args.length -1) {
1736
FAIL("Plane number missing after -plane");
1737
}
1738
else {
1739
plane = Integer.parseInt(args[++j]);
1740
}
1741
if (plane > 0) {
1742
bLatin1 = false;
1743
}
1744
}
1745
else if ("-usecharforbyte".equals(args[j])) {
1746
useCharForByte = true;
1747
}
1748
else if (args[j].equals("-latin1")) {
1749
bLatin1 = true;
1750
plane = 0;
1751
}
1752
else {
1753
try {
1754
int val = Integer.parseInt(args[j]);
1755
if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1756
if (sizes == null)
1757
sizes = new int[1];
1758
else {
1759
int[] newsizes = new int[sizes.length + 1];
1760
System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1761
sizes = newsizes;
1762
}
1763
sizes[sizes.length - 1] = val;
1764
}
1765
catch(NumberFormatException e) {
1766
FAIL("Unknown switch: " + args[j]);
1767
}
1768
}
1769
}
1770
if (Csyntax && tableAsString) {
1771
FAIL("Can't specify table as string with C syntax");
1772
}
1773
if (sizes == null) {
1774
desc.append(" [");
1775
if (identifiers) {
1776
int[] newsizes = { 8, 4, 4 }; // Good default values
1777
desc.append("8 4 4]");
1778
sizes = newsizes;
1779
}
1780
else {
1781
int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1782
desc.append("10 5 1]");
1783
sizes = newsizes;
1784
}
1785
}
1786
if (UnicodeSpecFileName == null) { // liu
1787
UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1788
desc.append(" [-spec " + UnicodeSpecFileName + ']');
1789
}
1790
if (SpecialCasingFileName == null) {
1791
SpecialCasingFileName = DefaultSpecialCasingFileName;
1792
desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1793
}
1794
if (PropListFileName == null) {
1795
PropListFileName = DefaultPropListFileName;
1796
desc.append(" [-proplist " + PropListFileName + ']');
1797
}
1798
if (TemplateFileName == null) {
1799
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1800
: DefaultJavaTemplateFileName);
1801
desc.append(" [-template " + TemplateFileName + ']');
1802
}
1803
if (OutputFileName == null) {
1804
OutputFileName = (Csyntax ? DefaultCOutputFileName
1805
: DefaultJavaOutputFileName);
1806
desc.append(" [-o " + OutputFileName + ']');
1807
}
1808
commentStart = (Csyntax ? "/*" : "//");
1809
commentEnd = (Csyntax ? " */" : "");
1810
commandLineDescription = desc.toString();
1811
}
1812
1813
private static void searchBins(long[] map, int binsOccupied) throws Exception {
1814
int bitsFree = 16;
1815
for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1816
if (binsOccupied == (bins-1)) {
1817
sizes[binsOccupied] = bitsFree;
1818
generateForSizes(map);
1819
}
1820
else {
1821
for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1822
sizes[binsOccupied] = i;
1823
searchBins(map, binsOccupied+1);
1824
}
1825
}
1826
}
1827
1828
private static void generateForSizes(long[] map) throws Exception {
1829
int sum = 0;
1830
shifts = new int[sizes.length];
1831
for (int k = sizes.length - 1; k >= 0; k--) {
1832
shifts[k] = sum;
1833
sum += sizes[k];
1834
}
1835
if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1836
FAIL("Bit field widths total to " + sum +
1837
": wrong total for map of size " + map.length);
1838
}
1839
// need a table for each set of lookup bits in char
1840
tables = new long[sizes.length][];
1841
// the last table is the map
1842
tables[sizes.length - 1] = map;
1843
for (int j = sizes.length - 1; j > 0; j--) {
1844
if (verbose && bins==0)
1845
System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1846
long[][] temp = buildTable(tables[j], sizes[j]);
1847
tables[j-1] = temp[0];
1848
tables[j] = temp[1];
1849
}
1850
preshifted = new boolean[sizes.length];
1851
zeroextend = new int[sizes.length];
1852
bytes = new int[sizes.length];
1853
for (int j = 0; j < sizes.length - 1; j++) {
1854
int len = tables[j+1].length;
1855
int size = sizes[j+1];
1856
if (len > 0x100 && (len >> size) <= 0x100) {
1857
len >>= size;
1858
preshifted[j] = false;
1859
}
1860
else if (len > 0x10000 && (len >> size) <= 0x10000) {
1861
len >>= size;
1862
preshifted[j] = false;
1863
}
1864
else preshifted[j] = true;
1865
if (Csyntax)
1866
zeroextend[j] = 0;
1867
else if (len > 0x7F && len <= 0xFF) {
1868
if (!useCharForByte) {
1869
zeroextend[j] = 0xFF;
1870
}
1871
} else if (len > 0x7FFF && len <= 0xFFFF)
1872
zeroextend[j] = 0xFFFF;
1873
else zeroextend[j] = 0;
1874
if (len <= 0x100) bytes[j] = 1;
1875
else if (len <= 0x10000) bytes[j] = 2;
1876
else bytes[j] = 4;
1877
}
1878
preshifted[sizes.length - 1] = true;
1879
zeroextend[sizes.length - 1] = 0;
1880
bytes[sizes.length - 1] = 0;
1881
if (bins > 0) {
1882
int totalBytes = getTotalBytes();
1883
String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1884
int accessComplexity = 0;
1885
for (int j=0; j<access.length(); ++j) {
1886
char ch = access.charAt(j);
1887
if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1888
if (ch == '<' || ch == '>') ++j;
1889
}
1890
System.out.print("(");
1891
for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1892
System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1893
return;
1894
}
1895
if (verbose) {
1896
System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1897
for (int j = 0; j < sizes.length; j++) {
1898
System.out.println(dec5(j) + "\t" +
1899
dec5(sizes[j]) + "\t" +
1900
dec5(tables[j].length) + "\t" +
1901
dec5(shifts[j]) + "\t" +
1902
dec5(zeroextend[j]) + "\t" +
1903
dec5(bytes[j]) + "\t " +
1904
preshifted[j]);
1905
}
1906
}
1907
if (verbose) {
1908
System.out.println("Generating source code for class Character");
1909
System.out.println("A table access looks like " +
1910
genAccess("A", "ch", (identifiers ? 2 : 32)));
1911
}
1912
generateCharacterClass(TemplateFileName, OutputFileName);
1913
}
1914
1915
/**
1916
* The main program for generating source code for the Character class.
1917
* The basic outline of its operation is:
1918
* <ol>
1919
* <li> Process the command line arguments. One result of this process
1920
* is a list of sizes (measured in bits and summing to 16).
1921
* <li> Get the Unicode character property data from the specification file.
1922
* <li> From that, build a map that has, for each character code, its
1923
* relevant properties encoded as a long integer value.
1924
* <li> Repeatedly compress the map, producing a compressed table and a
1925
* new map. This is done once for each size value in the list.
1926
* When this is done, we have a set of tables.
1927
* <li> Make some decisions about table representation; record these
1928
* decisions in arrays named preshifted, zeroextend, and bytes.
1929
* <li> Generate the source code for the class Character by performing
1930
* macro processing on a template file.
1931
* </ol>
1932
*
1933
* @param args the command line arguments, as an array of String
1934
*
1935
* @see GenerateCharacter#processArgs
1936
* @see UnicodeSpec@readSpecFile
1937
* @see GenerateCharacter#buildMap
1938
* @see GenerateCharacter#buildTable
1939
* @see GenerateCharacter#generateCharacterClass
1940
*/
1941
1942
public static void main(String[] args) {
1943
processArgs(args);
1944
try {
1945
1946
UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1947
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1948
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1949
1950
if (verbose) {
1951
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1952
}
1953
long[] map = buildMap(data, specialCaseMaps, propList);
1954
if (verbose) {
1955
System.err.println("Completed building of initial map");
1956
}
1957
1958
if (bins == 0) {
1959
generateForSizes(map);
1960
}
1961
else {
1962
while (bins > 0) {
1963
sizes = new int[bins];
1964
searchBins(map, 0);
1965
--bins;
1966
}
1967
}
1968
if (verbose && false) {
1969
System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1970
hex8(maxOffsetSeen));
1971
System.out.println(" allowed: -" + hex8(-minOffset) + "..+" +
1972
hex8(maxOffset));
1973
}
1974
}
1975
catch (FileNotFoundException e) { FAIL(e.toString()); }
1976
catch (IOException e) { FAIL(e.toString()); }
1977
catch (Throwable e) {
1978
System.out.println("Unexpected exception:");
1979
e.printStackTrace();
1980
FAIL("Unexpected exception!");
1981
}
1982
if (verbose) { System.out.println("Done!");}
1983
}
1984
1985
} // end class
1986
1987