CoCalc -- GenerateCharacter.java

GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java
³²²⁸⁷ views
1
/*
2
 * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25

26
package build.tools.generatecharacter;
27

28
import java.io.IOException;
29
import java.io.FileNotFoundException;
30
import java.io.BufferedReader;
31
import java.io.FileReader;
32
import java.io.PrintWriter;
33
import java.io.BufferedWriter;
34
import java.io.FileWriter;
35
import java.io.File;
36
import java.util.List;
37

38
import build.tools.generatecharacter.CharacterName;
39

40
/**
 * This program generates the source code for the class java.lang.Character.
 * It also generates native C code that can perform the same operations.
 * It requires two external input data files:
 * <ul>
 * <li> Unicode specification file
 * <li> Character class template file
 * </ul>
 * The Unicode specification file is available from the Unicode consortium.
 * It has character specification lines that look like this:
 * <listing>
 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
 * </listing>
 * The Character class template file is filled in with additional
 * information to produce the file Character.java, which can then be
 * compiled by a Java compiler.  The template file contains certain
 * markers consisting of an alphabetic name string preceded by "$$".
 * Such markers are replaced with generated program text.  As a special
 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
 * alphabetic characters constituting a variable name.  The character "_"
 * is considered alphabetic for these purposes.
 *
 * @author  Guy Steele
 * @author  Alan Liu
 * @author  John O'Conner
 */
66

67
public class GenerateCharacter {
68

69
    final static boolean DEBUG = false;
70

71
    final static String commandMarker = "$$";
72
    static String ROOT                        = "";
73
    static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
74
    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75
    static String DefaultPropListFileName     = ROOT + "PropList.txt";
76
    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
77
    static String DefaultJavaOutputFileName   = ROOT + "Character.java";
78
    static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
79
    static String DefaultCOutputFileName      = ROOT + "Character.c";
80

81
    static int plane = 0;
82

83
    /* The overall idea is that, in the generated Character class source code,
    most character property data is stored in a special multi-level table whose
    structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
    The integers must sum to 16 (the number of bits in a character).
    The first table is indexed by the k1 high-order bits of the character code.
    The result is concatenated to the next k2 bits of the character code to index
    the second table, and so on.  Eventually the kn low-order bits of the character
    code are concatenated and used to index one of two tables A and B; A contains
    32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
    can be thus obtained encode the properties for the character.

    The default specification is [9, 4, 3, 0].  This particular table format was
    designed by conducting an exhaustive search of table formats to minimize the
    space consumed by the tables: the first and third tables need have only byte
    values (the second table must have short values).  Another good choice is
    [10, 6, 0], which produces a larger table but allows particularly fast table
    lookup code.

    In each case, where the word "concatenated" is used, this may imply
    first a << and then a | operation, or perhaps just a | operation if
    the values in the table can be preshifted (generally possible if the table
    entries are short rather than byte).
    */
106

107
    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).

    A: the low 32 bits are defined in the following manner, listed from the
       most significant bit (bit 31) down to the least significant bit (bit 0):

    1 bit       Mirrored property.
    4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
    9 bits      A signed offset used for converting case.
    1 bit       If 1, adding the signed offset converts the character to lowercase.
    1 bit       If 1, subtracting the signed offset converts the character to uppercase.
        Note: for a titlecase character, both of the preceding bits will be 1
        and the signed offset will be 1.
    1 bit       If 1, this character has a titlecase equivalent (possibly itself);
        in this case, the two bits before this bit can be used to decide
        whether this character is in fact uppercase, lowercase, or titlecase.
    3 bits      This field provides a quick way to lex identifiers.
        The eight possible values for this field are as follows:
        0  May not be part of an identifier
        1  Ignorable control; may continue a Unicode identifier or Java identifier
        2  May continue a Java identifier but not a Unicode identifier (unused)
        3  May continue a Unicode identifier or Java identifier
        4  Is a Java whitespace character
        5  May start or continue a Java identifier;
           may continue but not start a Unicode identifier
           (this value is used for connector punctuation such as _)
        6  May start or continue a Java identifier;
           may not occur in a Unicode identifier
           (this value is used for currency symbols such as $)
        7  May start or continue a Unicode identifier or Java identifier
        Thus:
           5, 6, 7 may start a Java identifier
           1, 2, 3, 5, 6, 7 may continue a Java identifier
           7 may start a Unicode identifier
           1, 3, 5, 7 may continue a Unicode identifier
           1 is ignorable within an identifier
           4 is Java whitespace
    2 bits      This field indicates whether the character has a numeric property.
        The four possible values for this field are as follows:
        0  This character has no numeric property.
        1  Adding the digit offset to the character code and then
           masking with 0x1F will produce the desired numeric value.
        2  This character has a "strange" numeric value.
        3  A Java supradecimal digit: adding the digit offset to the
           character code, then masking with 0x1F, then adding 10
           will produce the desired numeric value.
    5 bits      The digit offset (see description of previous field)
    5 bits      Character type (see below)

    B: the high 16 bits are defined as follows, listed from the lowest
       bit of this part (bit 32 of the combined value) upward:

    1 bit       Other_Lowercase property
    1 bit       Other_Uppercase property
    1 bit       Other_Alphabetic property
    1 bit       Other_Math property
    1 bit       Ideographic property
    1 bit       Noncharacter codepoint property
    */
163

164

165
    // bit masks identify each component of a 32-bit property field described
166
    // above.
167
    // shift* indicates how many shifts right must happen to get the
168
    // indicated property value in the lowest bits of the 32-bit space.
169
    private static final int
170
        shiftType           = 0,        maskType            =       0x001F,
171
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
172
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
173
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
174
                                        maskUnicodePart     =       0x1000,
175
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
176
                                        maskLowerCase       =      0x20000,
177
                                        maskUpperCase       =      0x10000,
178
                                        maskTitleCase       =      0x08000,
179
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
180
        shiftCaseOffsetSign = 5,
181
                                        // used only when calculating and
182
                                        // storing digit offsets from char values
183
                                        maskDigit               =   0x001F,
184
                                        // case offset are 9 bits
185
                                        maskCase                =   0x01FF,
186
        shiftBidi           = 27,       maskBidi              = 0x78000000,
187
        shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
188
        shiftPlane          = 16,       maskPlane = 0xFF0000;
189

190
    // maskMirrored needs to be long, if up 16-bit
191
    private static final long maskMirrored          = 0x80000000L;
192

193
    // bit masks identify the 16-bit priperty field described above, in B
194
    // table
195
    private static final long
196
        maskOtherLowercase  = 0x100000000L,
197
        maskOtherUppercase  = 0x200000000L,
198
        maskOtherAlphabetic = 0x400000000L,
199
        maskOtherMath       = 0x800000000L,
200
        maskIdeographic     = 0x1000000000L,
201
        maskNoncharacterCP  = 0x2000000000L;
202

203
    // Can compare masked values with these to determine
204
    // numeric or lexical types.
205
    public static int
206
        valueNotNumeric             = 0x0000,
207
        valueDigit                  = 0x0400,
208
        valueStrangeNumeric         = 0x0800,
209
        valueJavaSupradecimal       = 0x0C00,
210
        valueIgnorable              = 0x1000,
211
        valueJavaOnlyPart           = 0x2000,
212
        valueJavaUnicodePart        = 0x3000,
213
        valueJavaWhitespace         = 0x4000,
214
        valueJavaStartUnicodePart   = 0x5000,
215
        valueJavaOnlyStart          = 0x6000,
216
        valueJavaUnicodeStart       = 0x7000,
217
        lowJavaStart                = 0x5000,
218
        nonzeroJavaPart             = 0x3000,
219
        valueUnicodeStart           = 0x7000;
220

221
    // these values are used when only identifier properties are generated
222
    // for use in verifier code. Shortens the property down to a single byte.
223
    private static final int
224
        bitJavaStart            = 0x02,
225
        bitJavaPart             = 0x01,
226
        maskIsJavaIdentifierPart = bitJavaPart,
227
        maskIsJavaIdentifierStart = bitJavaStart;
228

229
    static int maxOffset = maskCase/2 ;
230
    static int minOffset = -maxOffset;
231

232
    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the value needs fewer characters than that number, the output
     is padded on the left with zeros (for hex) or with spaces (for decimal).
     hex2, hex4 and hex8 first mask the value to its low 8, 16 and 32 bits
     respectively, so they always produce exactly that many digits.  hex16, dec3 and
     dec5 do not truncate: if the value needs more characters than the desired number,
     all the characters that are required are actually produced.
    */
240

241
    /**
     * Formats a value as upper-case hexadecimal without any padding.
     * The value is treated as unsigned, as by {@link Long#toHexString(long)}.
     *
     * @param n the value to format
     * @return the upper-case hexadecimal digits of n
     */
    static String hex(long n) {
        return Long.toHexString(n).toUpperCase();
    }
242

243
    static String hex2(long n) {
244
        String q = Long.toHexString(n & 0xFF).toUpperCase();
245
        return "00".substring(Math.min(2, q.length())) + q;
246
    }
247

248
    static String hex4(long n) {
249
        String q = Long.toHexString(n & 0xFFFF).toUpperCase();
250
        return "0000".substring(Math.min(4, q.length())) + q;
251
    }
252

253
    static String hex8(long n) {
254
        String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
255
        return "00000000".substring(Math.min(8, q.length())) + q;
256
    }
257

258
    static String hex16(long n) {
259
        String q = Long.toHexString(n).toUpperCase();
260
        return "0000000000000000".substring(Math.min(16, q.length())) + q;
261
    }
262

263
    static String dec3(long n) {
264
        String q = Long.toString(n);
265
        return "   ".substring(Math.min(3, q.length())) + q;
266
    }
267

268
    static String dec5(long n) {
269
        String q = Long.toString(n);
270
        return "     ".substring(Math.min(5, q.length())) + q;
271
    }
272

273
    /* This routine is called when some failure occurs. */
274

275
    static void FAIL(String s) {
276
        System.out.println("** " + s);
277
    }
278

279
    /**
280
    * Given the data from the Unicode specification file, this routine builds a map.
281
    *
282
    * The specification file is assumed to contain its data in sorted order by
283
    * character code; as a result, the array passed as an argument to this method
284
    * has its components in the same sorted order, with one entry for each defined
285
    * Unicode character or character range.  (A range is indicated by two consecutive
286
    * entries, such that the name of the first entry begins with "<" and ends with
287
    * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
288
    * therefore a sparse representation of the character property data.
289
    *
290
    * The resulting map is dense representation of the character data.  It contains
291
    * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
292
    * of this long value are used, but type long is used rather than int to facilitate
293
    * future extensions of this source code generator that might require more than
294
    * 32 bits to encode relevant character properties.)  Entry k holds the encoded
295
    * properties for character k.
296
    *
297
    * Method buildMap manages the transformation from the sparse representation to
298
    * the dense representation.  It calls method buildOne to handle the encoding
299
    * of character property data from a single UnicodeSpec object into 32 bits.
300
    * For undefined characters, method buildOne is not called and the map entry for
301
    * that character is set to UnicodeSpec.UNASSIGNED.
302
    *
303
    * @param data       character property data from the Unicode specification file
304
    * @return   an array of length 65536 with one entry for every possible char value
305
    *
306
    * @see GenerateCharacter#buildOne
307
    */
308

309
    static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
310
    {
311
        long[] result;
312
        if (bLatin1 == true) {
313
            result = new long[256];
314
        } else {
315
            result = new long[1<<16];
316
        }
317
        int k=0;
318
        int codePoint = plane<<16;
319
        UnicodeSpec nonCharSpec = new UnicodeSpec();
320
        for (int j = 0; j < data.length && k < result.length; j++) {
321
            if (data[j].codePoint == codePoint) {
322
                result[k] = buildOne(codePoint, data[j], specialMaps);
323
                ++k;
324
                ++codePoint;
325
            }
326
            else if(data[j].codePoint > codePoint) {
327
                if (data[j].name.endsWith("Last>")) {
328
                    // build map data for all chars except last in range
329
                    while (codePoint < data[j].codePoint && k < result.length) {
330
                        result[k] = buildOne(codePoint, data[j], specialMaps);
331
                        ++k;
332
                        ++codePoint;
333
                    }
334
                }
335
                else {
336
                    // we have a few unassigned chars before data[j].codePoint
337
                    while (codePoint < data[j].codePoint && k < result.length) {
338
                        result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
339
                        ++k;
340
                        ++codePoint;
341
                    }
342
                }
343
                k = data[j].codePoint & 0xFFFF;
344
                codePoint = data[j].codePoint;
345
                result[k] = buildOne(codePoint, data[j], specialMaps);
346
                ++k;
347
                ++codePoint;
348
            }
349
            else {
350
                System.out.println("An error has occured during spec mapping.");
351
                System.exit(0);
352
            }
353
        }
354
        // if there are still unprocessed chars, process them
355
        // as unassigned/undefined.
356
        codePoint = (plane<<16) | k;
357
        while (k < result.length) {
358
            result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
359
            ++k;
360
            ++codePoint;
361
        }
362
        // now add all extra supported properties from PropList, to the
363
        // upper 16-bit
364
        addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
365
        addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
366
        addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
367
        addExProp(result, propList, "Ideographic", maskIdeographic);
368
        //addExProp(result, propList, "Other_Math", maskOtherMath);
369
        //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
370

371
        return result;
372
    }
373

374
    // The maximum and minimum offsets found while scanning the database
375
    static int maxOffsetSeen = 0;
376
    static int minOffsetSeen = 0;
377

378
    /**
379
     * Some Unicode separator characters are not considered Java whitespace.
380
     * @param c character to test
381
     * @return true if c in an invalid Java whitespace character, false otherwise.
382
     */
383
    static boolean isInvalidJavaWhiteSpace(int c) {
384
        int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
385
        boolean retValue = false;
386
        for(int x=0;x<exceptions.length;x++) {
387
            if(c == exceptions[x]) {
388
                retValue = true;
389
                break;
390
            }
391
        }
392
        return retValue;
393

394
    }
395

396
    /**
397
    * Given the character property data for one Unicode character, encode the data
398
    * of interest into a single long integer value.  (Right now only 32 bits
399
    * of this long value are used, but type long is used rather than int to facilitate
400
    * future extensions of this source code generator that might require more than
401
    * 32 bits to encode relevant character properties.)
402
    *
403
    * @param c   the character code for which to encode property data
404
    * @param us  property data record from the Unicode specification file
405
    *            (its character code might not be equal to c if it specifies data
406
    *            for a range of characters)
407
    * @return   an encoded long value that contains the properties for a single char
408
    *
409
    * @see GenerateCharacter#buildMap
410
    */
411

412
    static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
413
        long resultA = 0;
414
        // record the general category
415
        resultA |= us.generalCategory;
416

417
        // record the numeric properties
418
        NUMERIC: {
419
        STRANGE: {
420
            int val = 0;
421
            // c is A-Z
422
            if ((c >= 0x0041) && (c <= 0x005A)) {
423
                val = c - 0x0041;
424
                resultA |= valueJavaSupradecimal;
425
            // c is a-z
426
            } else if ((c >= 0x0061) && (c <= 0x007A)) {
427
                val = c - 0x0061;
428
                resultA |= valueJavaSupradecimal;
429
            // c is a full-width A-Z
430
            } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
431
                val = c - 0xFF21;
432
                resultA |= valueJavaSupradecimal;
433
            // c is a full-width a-z
434
            } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
435
                val = c - 0xFF41;
436
                resultA |= valueJavaSupradecimal;
437
            } else if (us.isDecimalValue()) {
438
                val = us.decimalValue;
439
                resultA |= valueDigit;
440
            } else if (us.isDigitValue()) {
441
                val = us.digitValue;
442
                resultA |= valueDigit;
443
            } else {
444
                if (us.numericValue.length() == 0) {
445
                    break NUMERIC;                      // no numeric value at all
446
                } else {
447
                    try {
448
                        val = Integer.parseInt(us.numericValue);
449
                        if (val >= 32 || val < 0) break STRANGE;
450
                        if (c == 0x215F) break STRANGE;
451
                    } catch(NumberFormatException e) {
452
                        break STRANGE;
453
                    }
454
                    resultA |= valueDigit;
455
                }
456
            }
457
            if (val >= 32 || val < 0) break STRANGE;
458
            resultA |= ((val - c & maskDigit) << shiftDigitOffset);
459
            break NUMERIC;
460
        } // end STRANGE
461
        resultA |= valueStrangeNumeric;
462
        } // end NUMERIC
463

464
        // record case mapping
465
        int offset = 0;
466
        // might have a 1:M mapping
467
        int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
468
        boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
469
        if (bHasUpper) {
470
            resultA |= maskUpperCase;
471
        }
472
        if (specialMap != -1) {
473
            // has mapping, but cannot record the
474
            // proper offset; can only flag it and provide special case
475
            // code in Character.java
476
            offset = -1;
477
        }
478
        else if (us.hasUpperMap())  {
479
            offset = c - us.upperMap;
480
        }
481

482
        if (us.hasLowerMap()) {
483
            resultA |= maskLowerCase;
484
            if (offset == 0)
485
                offset = us.lowerMap - c;
486
            else if (offset != (us.lowerMap - c)) {
487
                if (DEBUG) {
488
                FAIL("Character " + hex(c) +
489
                " has incompatible lowercase and uppercase mappings");
490
                }
491
            }
492
        }
493
        if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
494
            (bHasUpper && us.hasLowerMap())) {
495
            resultA |= maskTitleCase;
496
        }
497
        if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
498
            System.out.println("Warning: Character " + hex4(c) + " has upper but " +
499
                               "no title case; Java won't know this");
500
        }
501
        if (offset < minOffsetSeen) minOffsetSeen = offset;
502
        if (offset > maxOffsetSeen) maxOffsetSeen = offset;
503
        if (offset > maxOffset || offset < minOffset) {
504
            if (DEBUG) {
505
            FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
506
            }
507
            offset = maskCase;
508
        }
509
        resultA |= ((offset & maskCase) << shiftCaseOffset);
510

511
        // record lexical info about this character
512
        if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
513
                || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
514
                || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
515
                || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
516
                || us.generalCategory == UnicodeSpec.OTHER_LETTER
517
                || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
518
            resultA |= valueJavaUnicodeStart;
519
        }
520
        else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
521
                || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
522
                || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
523
            resultA |= valueJavaUnicodePart;
524
        }
525
        else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
526
            resultA |= valueJavaStartUnicodePart;
527
        }
528
        else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
529
            resultA |= valueJavaOnlyStart;
530
        }
531
        else if (((c >= 0x0000) && (c <= 0x0008))
532
                || ((c >= 0x000E) && (c <= 0x001B))
533
                || ((c >= 0x007F) && (c <= 0x009F))
534
                || us.generalCategory == UnicodeSpec.FORMAT) {
535
            resultA |= valueIgnorable;
536
        }
537
        else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
538
                || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
539
                || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
540
            if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
541
        }
542
        else if (((c >= 0x0009) && (c <= 0x000D))
543
                || ((c >= 0x001C) && (c <= 0x001F))) {
544
            resultA |= valueJavaWhitespace;
545
        }
546

547
        // record bidi category
548
        if (!nobidi) {
549
            int tmpBidi =
550
                (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
551
                    us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
552
            resultA |= tmpBidi;
553
        }
554

555
        // record mirrored property
556
        if (!nomirror) {
557
            resultA |= us.mirrored ? maskMirrored : 0;
558
        }
559

560
        if (identifiers) {
561
            long replacement = 0;
562
            if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
563
                replacement |= bitJavaStart;
564
            }
565
            if ( ((resultA & nonzeroJavaPart) != 0)
566
                    && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
567
                replacement |= bitJavaPart;
568
            }
569
            resultA = replacement;
570
        }
571
        return resultA;
572
    }
573

574
    static void addExProp(long[] map, PropList propList, String prop, long mask) {
575
        List<Integer> cps = propList.codepoints(prop);
576
        if (cps != null) {
577
            for (Integer cp : cps) {
578
                if (cp < map.length)
579
                    map[cp] |= mask;
580
            }
581
        }
582
    }
583

584
    /**
585
    * This is the heart of the table compression strategy.  The inputs are a map
586
    * and a number of bits (size).  The map is simply an array of long integer values;
587
    * the number of bits indicates how index values for that map are to be split.
588
    * The length of the given map must be a multiple of (1 << size).  The result is
589
    * a new map z and a compressed table t such that for every valid index value k
590
    * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
591
    *
592
    * In other words, the index k can be split into two parts, namely the "size"
593
    * low-order bits and all the remaining high-order bits; the high-order bits are then
594
    * remapped by map z to produce an index into table t.  In effect, the data of the
595
    * original map m is broken up into blocks of size (1<<size); the compression relies
596
    * on the expectation that many of these blocks will be identical and therefore need
597
    * be represented only once in the compressed table t.
598
    *
599
    * This method is intended to be used iteratively.  The first map to be handed
600
    * to it is the one constructed by method buildMap.  After that, the first of the
601
    * two arrays returned by this method is fed back into it for further compression.
602
    * At the end of the iteration, one has a starter map and a sequence of tables.
603
    *
604
    * The algorithm used to implement this computation is straightforward and not
605
    * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
606
    * to locate identical blocks, so overall the time complexity of the algorithm
607
    * is quadratic in the length of the input map.  Fortunately, speed is not crucial
608
    * to this application.
609
    *
610
    * @param map                a map to be compressed
611
    * @param size       the number of index bits to be split off by the compression
612
    * @return   an array of length 2 containing two arrays; the first is a new map
613
    *           and the second is a compressed data table
614
    *
615
    * @see GenerateCharacter#buildMap
616
    */
617

618
    static long[][] buildTable(long[] map, int size) {
619
        int n = map.length;
620
        if (((n >> size) << size) != n) {
621
            FAIL("Length " + n + " is not a multiple of " + (1 << size));
622
        }
623
        int m = 1 << size;
624
        // We know the final length of the new map up front.
625
        long[] newmap = new long[n >> size];
626
        // The buffer is used temporarily to hold data for the compressed table
627
        // because we don't know its final length yet.
628
        long[] buffer = new long[n];
629
        int ptr = 0;
630
OUTER:  for (int i = 0; i < n; i += m) {
631
            // For every block of size m in the original map...
632
    MIDDLE: for (int j = 0; j < ptr; j += m) {
633
            // Find out whether there is already a block just like it in the buffer.
634
                for (int k = 0; k < m; k++) {
635
                    if (buffer[j+k] != map[i+k])
636
                        continue MIDDLE;
637
                }
638
                // There is a block just like it at position j, so just
639
                // put its index into the new map (thereby sharing it).
640
                newmap[i >> size] = (j >> size);
641
                continue OUTER;
642
            } // end MIDDLE
643
            // There is no block just like it already, so add it to
644
            // the buffer and put its index into the new map.
645
            for (int k = 0; k < m; k++) {
646
                buffer[ptr+k] = map[i+k];
647
            }
648
            newmap[i >> size] = (ptr >> size);
649
            ptr += m;
650
        } // end OUTER
651
        // Now we know how long the compressed table should be,
652
        // so create a new array and copy data from the temporary buffer.
653
        long[] newdata = new long[ptr];
654
        for (int j = 0; j < ptr; j++) {
655
            newdata[j] = buffer[j];
656
        }
657
        // Return the new map and the new data table.
658
        long[][] result = { newmap, newdata };
659
        return result;
660
    }
661

662
    /**
663
    * Once the compressed tables have been computed, this method reads in a
664
    * template file for the source code to be generated and writes out the final
665
    * source code by acting as a sort of specialized macro processor.
666
    *
667
    * The first output line is a comment saying that the file was automatically
668
    * generated; it includes a timestamp.  All other output is generated by
669
    * reading a line from the template file, performing macro replacements,
670
    * and then writing the resulting line or lines of code to the output file.
671
    *
672
    * This method handles the I/O, the timestamp comment, and the locating of
673
    * macro calls within each input line.  The method replaceCommand is called
674
    * to generate replacement text for each macro call.
675
    *
676
    * Macro calls to be replaced are indicated in the template file by
677
    * occurrences of the commandMarker "$$".  The rest of the call may consist
678
    * of Java letters (including the underscore "_") and also of balanced
679
    * parentheses.
680
    *
681
    * @param theTemplateFileName
682
    *           the file name for the template input file
683
    * @param theOutputFileName
684
    *           the file name for the source code output file
685
    *
686
    *     @see GenerateCharacter#replaceCommand
687
    */
688

689
    static void generateCharacterClass(String theTemplateFileName,
690
                                       String theOutputFileName)
691
        throws FileNotFoundException, IOException {
692
        BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
693
        PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
694
        out.println(commentStart +
695
            " This file was generated AUTOMATICALLY from a template file " +
696
            new java.util.Date() + commentEnd);
697
        int marklen = commandMarker.length();
698
        LOOP: while(true) {
699
            try {
700
                String line = in.readLine();
701
                if (line == null) break LOOP;
702
                int pos = 0;
703
                int depth = 0;
704
                while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
705
                    int newpos = pos + marklen;
706
                    char ch = 'x';
707
                    SCAN: while (newpos < line.length() &&
708
                            (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
709
                            || ch == '(' || (ch == ')' && depth > 0))) {
710
                        ++newpos;
711
                        if (ch == '(') {
712
                            ++depth;
713
                        }
714
                        else if (ch == ')') {
715
                            --depth;
716
                            if (depth == 0)
717
                                break SCAN;
718
                        }
719
                    }
720
                    String replacement = replaceCommand(line.substring(pos + marklen, newpos));
721
                    line = line.substring(0, pos) + replacement + line.substring(newpos);
722
                    pos += replacement.length();
723
                }
724
                out.println(line);
725
            }
726
            catch (IOException e) {
727
                break LOOP;
728
            }
729
        }
730
        in.close();
731
        out.close();
732
    }
733

734
    /**
735
    * The replaceCommand method takes a command (a macro call without the
736
    * leading marker "$$") and computes replacement text for it.
737
    *
738
    * Most of the commands are simply names of integer constants that are defined
739
    * in the source code of this GenerateCharacter class.  The replacement text is
740
    * simply the value of the constant as an appropriately formatted integer literal.
741
    *
742
    * Two cases are more complicated, however.  The command "Tables" causes the
743
    * final map and compressed tables to be emitted, with elaborate comments
744
    * describing their contents.  (This is actually handled by method genTables.)
745
    * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
746
    * an expression that will return the character property data for the character
747
    * whose code is the value of the variable "xxx".  (This is handled by method
748
    * "genAccess".)
749
    *
750
    * @param x  a command from the template file to be replaced
751
    * @return   the replacement text, as a String
752
    *
753
    * @see GenerateCharacter#genTables
754
    * @see GenerateCharacter#genAccess
755
    * @see GenerateCharacter#generateCharacterClass
756
    */
757

758
    static String replaceCommand(String x) {
759
        if (x.equals("Tables")) return genTables();
760
        if (x.equals("Initializers")) return genInitializers();
761
        if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
762
                x.substring(x.length()-1).equals(")") )
763
            return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
764
        if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
765
                x.substring(x.length()-1).equals(")") )
766
            return genAccess("B", x.substring(9, x.length()-1), 16);
767
        if (x.equals("shiftType")) return Long.toString(shiftType);
768
        if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
769
        if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
770
        if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
771
        if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
772
        if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
773
        if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
774
        if (x.equals("maskCase")) return "0x" + hex8(maskCase);
775
        if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
776
        if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
777
        if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
778
        if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
779
        if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
780
        if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781
        if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782
        if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
783
        if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784
        if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785
        if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
786
        if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
787
        if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
788
        if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
789
        if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
790
        if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
791
        if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
792
        if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
793
        if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
794
        if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
795
        if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
796
        if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
797
        if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
798
        if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
799
        if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
800
        if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
801
        if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
802
        if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
803
        if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
804
        if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
805
        if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
806
        if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
807
        if (x.equals("maskType")) return "0x" + hex(maskType);
808
        if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
809
        if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
810
        if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
811
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
812
            return Integer.toString(UnicodeSpec.UNASSIGNED);
813
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
814
            return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
815
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
816
            return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
817
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
818
            return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
819
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
820
             return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
821
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
822
             return Integer.toString(UnicodeSpec.OTHER_LETTER);
823
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
824
             return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
825
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
826
             return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
827
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
828
             return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
829
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
830
             return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
831
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
832
             return Integer.toString(UnicodeSpec.OTHER_NUMBER);
833
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
834
             return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
835
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
836
             return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
837
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
838
             return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
839
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
840
            return Integer.toString(UnicodeSpec.CONTROL);
841
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
842
            return Integer.toString(UnicodeSpec.FORMAT);
843
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
844
            return Integer.toString(UnicodeSpec.PRIVATE_USE);
845
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
846
            return Integer.toString(UnicodeSpec.SURROGATE);
847
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
848
            return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
849
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
850
            return Integer.toString(UnicodeSpec.START_PUNCTUATION);
851
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
852
            return Integer.toString(UnicodeSpec.END_PUNCTUATION);
853
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
854
            return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
855
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
856
            return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
857
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
858
            return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
859
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
860
            return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
861
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
862
            return Integer.toString(UnicodeSpec.LETTER_NUMBER);
863
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
864
            return Integer.toString(UnicodeSpec.MATH_SYMBOL);
865
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
866
            return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
867
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
868
            return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
869
        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
870
            return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
871
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
872
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
873
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
874
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
875
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
876
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
877
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
878
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
879
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
880
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
881
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
882
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
883
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
884
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
885
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
886
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
887
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
888
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
889
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
890
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
891
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
892
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
893
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
894
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
895
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
896
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
897
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
898
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
899
         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
900
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
901
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
902
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
903
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
904
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
905
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
906
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
907
        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
908
            return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
909
        FAIL("Unknown text substitution marker " + commandMarker + x);
910
        return commandMarker + x;
911
    }
912

913
    /**
914
    * The genTables method generates source code for all the lookup tables
915
    * needed to represent the various Unicode character properties.
916
    * It simply calls the method genTable once for each table to be generated
917
    * and then generates a summary comment.
918
    *
919
    * @return   the replacement text for the "Tables" command, as a String
920
    *
921
    * @see GenerateCharacter#genTable
922
    * @see GenerateCharacter#replaceCommand
923
    */
924
    static String genTables() {
925
        int n = sizes.length;
926
        StringBuffer result = new StringBuffer();
927
        // liu : Add a comment showing the source of this table
928
        result.append(commentStart + " The following tables and code generated using:" +
929
                  commentEnd + "\n  ");
930
        result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
931

932
                if (plane == 0 && bLatin1 == false) {
933
            genCaseMapTableDeclaration(result);
934
            genCaseMapTable(initializers, specialCaseMaps);
935
                }
936
        int totalBytes = 0;
937
        for (int k = 0; k < n - 1; k++) {
938
            genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
939
                sizes[k+1], false, false, k==0);
940
            int s = bytes[k];
941
            if (s == 1 && useCharForByte) {
942
                s = 2;
943
            }
944
            totalBytes += tables[k].length * s;
945
        }
946
        genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
947
            sizes[n - 1], false, 0, true, !(identifiers), false);
948

949
        // If we ever need more than 32 bits to represent the character properties,
950
        // then a table "B" may be needed as well.
951
        genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
952

953
        totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
954
        result.append(commentStart);
955
        result.append(" In all, the character property tables require ");
956
        result.append(totalBytes).append(" bytes.").append(commentEnd);
957
        if (verbose) {
958
            System.out.println("The character property tables require "
959
                 + totalBytes + " bytes.");
960
        }
961
        return result.toString();
962
    }
963

964
    /**
965
     * The genInitializers method generates the body of the
966
     * ensureInitted() method, which enables lazy initialization of
967
     * the case map table and other tables.
968
     */
969
    static String genInitializers() {
970
        return initializers.toString();
971
    }
972

973
    /**
974
     * Return the total number of bytes needed by all tables.  This is a stripped-
975
     * down copy of genTables().
976
     */
977
    static int getTotalBytes() {
978
        int n = sizes.length;
979
        int totalBytes = 0;
980
        for (int k = 0; k < n - 1; k++) {
981
            totalBytes += tables[k].length * bytes[k];
982
        }
983
        totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
984
                         + 31) >> 5) << 2);
985
        return totalBytes;
986
    }
987

988
    static void appendEscapedStringFragment(StringBuffer result,
989
                                            char[] line,
990
                                            int length,
991
                                            boolean lastFragment) {
992
        result.append("    \"");
993
        for (int k=0; k<length; ++k) {
994
            result.append("\\u");
995
            result.append(hex4(line[k]));
996
        }
997
        result.append("\"");
998
        result.append(lastFragment ? ";" : "+");
999
        result.append("\n");
1000
    }
1001

1002
    static String SMALL_INITIALIZER =
1003
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1004
        // "            $$name = new $$type[$$size];\n"+
1005
        "            int len = $$name_DATA.length();\n"+
1006
        "            int j=0;\n"+
1007
        "            for (int i=0; i<len; ++i) {\n"+
1008
        "                int c = $$name_DATA.charAt(i);\n"+
1009
        "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
1010
        "                    $$name[j++] = ($$type)c;\n"+
1011
        "                    c >>= $$bits;\n"+
1012
        "                }\n"+
1013
        "            }\n"+
1014
        "            assert (j == $$size);\n"+
1015
        "        }\n";
1016

1017
    static String SAME_SIZE_INITIALIZER =
1018
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1019
        "            assert ($$name_DATA.length() == $$size);\n"+
1020
        // "            $$name = new $$type[$$size];\n"+
1021
        "            for (int i=0; i<$$size; ++i)\n"+
1022
        "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
1023
        "        }\n";
1024

1025
    static String BIG_INITIALIZER =
1026
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1027
        // "            $$name = new $$type[$$size];\n"+
1028
        "            int len = $$name_DATA.length();\n"+
1029
        "            int j=0;\n"+
1030
        "            int charsInEntry=0;\n"+
1031
        "            $$type entry=0;\n"+
1032
        "            for (int i=0; i<len; ++i) {\n"+
1033
        "                entry |= $$name_DATA.charAt(i);\n"+
1034
        "                if (++charsInEntry == $$charsPerEntry) {\n"+
1035
        "                    $$name[j++] = entry;\n"+
1036
        "                    entry = 0;\n"+
1037
        "                    charsInEntry = 0;\n"+
1038
        "                }\n"+
1039
        "                else {\n"+
1040
        "                    entry <<= 16;\n"+
1041
        "                }\n"+
1042
        "            }\n"+
1043
        "            assert (j == $$size);\n"+
1044
        "        }\n";
1045

1046
    static String INT32_INITIALIZER =
1047
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1048
        "            char[] data = $$name_DATA.toCharArray();\n"+
1049
        "            assert (data.length == ($$size * 2));\n"+
1050
        "            int i = 0, j = 0;\n"+
1051
        "            while (i < ($$size * 2)) {\n"+
1052
        "                int entry = data[i++] << 16;\n"+
1053
        "                $$name[j++] = entry | data[i++];\n"+
1054
        "            }\n"+
1055
        "        }\n";
1056

1057
    static void addInitializer(String name, String type, int entriesPerChar,
1058
                               int bits, int size) {
1059

1060
        String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1061
                          ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1062
        if (entriesPerChar == -2) {
1063
            template = INT32_INITIALIZER;
1064
        }
1065
        int marklen = commandMarker.length();
1066
        int pos = 0;
1067
        while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1068
            int newpos = pos + marklen;
1069
            char ch = 'x';
1070
            while (newpos < template.length() &&
1071
                   Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1072
                   ch != '_') // Don't allow this in token names
1073
                ++newpos;
1074
            String token = template.substring(pos+marklen, newpos);
1075
            String replacement = "ERROR";
1076

1077
            if (token.equals("name")) replacement = name;
1078
            else if (token.equals("type")) replacement = type;
1079
            else if (token.equals("bits")) replacement = ""+bits;
1080
            else if (token.equals("size")) replacement = ""+size;
1081
            else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1082
            else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1083
            else FAIL("Unrecognized token: " + token);
1084

1085
            template = template.substring(0, pos) + replacement + template.substring(newpos);
1086
            pos += replacement.length();
1087
        }
1088
        initializers.append(template);
1089
    }
1090

1091
    /**
1092
    * The genTable method generates source code for one lookup table.
1093
    * Most of the complexity stems from handling various options as to
1094
    * the type of the array components, the precise representation of the
1095
    * values, the format in which to render each value, the number of values
1096
    * to emit on each line of source code, and the kinds of useful comments
1097
    * to be generated.
1098
    *
1099
    * @param result     a StringBuffer, to which the generated source code
1100
    *                   text is to be appended
1101
    * @param name       the name of the table
1102
    * @param table      the table data (an array of long values)
1103
    * @param extract    a distance, in bits, by which each entry of the table
1104
    *                   is to be right-shifted before it is processed
1105
    * @param bits       the number of bits (not bytes) to be used to represent
1106
    *                   each table entry
1107
    * @param size       the table data is divided up into blocks of size (1<<size);
1108
    *                   in this method, this information is used only to affect
1109
    *                   how many table values are to be generated per line
1110
    * @param preshifted if this flag is true, then the table entries are to be
1111
    *                   emitted in a preshifted form; that is, each value should
1112
    *                   be left-shifted by the amount "shift", so that this work
1113
    *                   is built into the table and need not be performed by an
1114
    *                   explicit shift operator at run time
1115
    * @param shift      this is the shift amount for preshifting of table entries
1116
    * @param hexFormat  if this flag is true, table entries should be emitted as
1117
    *                   hexadecimal literals; otherwise decimal literals are used
1118
    * @param properties if this flag is true, the table entries are encoded
1119
    *                   character properties rather than indexes into yet other tables;
1120
    *                   therefore comments describing the encoded properties should
1121
    *                   be generated
1122
    * @param hexComment if this flag is true, each line of output is labelled with
1123
    *                   a hexadecimal comment indicating the character values to
1124
    *                   which that line applies; otherwise, decimal values indicating
1125
    *                   table indices are generated
1126
    *
1127
    * @see GenerateCharacter#genTables
1128
    * @see GenerateCharacter#replaceCommand
1129
    */
1130

1131
    static void genTable(StringBuffer result, String name,
1132
                         long[] table, int extract, int bits, int size,
1133
                         boolean preshifted, int shift, boolean hexFormat,
1134
                         boolean properties, boolean hexComment) {
1135

1136
        String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1137
            bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1138
            bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1139
            bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1140
            bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1141
            bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1142
            (Csyntax ? "int64" : "long");
1143
        long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1144
            bits == 2 ? Integer.MAX_VALUE :
1145
            bits == 4 ? Integer.MAX_VALUE :
1146
            bits == 8 ? Byte.MAX_VALUE :
1147
            bits == 16 ? Short.MAX_VALUE :
1148
            bits == 32 ? Integer.MAX_VALUE :
1149
            Long.MAX_VALUE;
1150
        int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1151
        boolean shiftEntries = preshifted && shift != 0;
1152
        if (bits == 8 && tableAsString && useCharForByte) {
1153
            atype = "char";
1154
            maxPosEntry = Character.MAX_VALUE;
1155
            entriesPerChar = 1;
1156
        }
1157
        boolean noConversion = atype.equals("char");
1158

1159
        result.append(commentStart);
1160
        result.append(" The ").append(name).append(" table has ").append(table.length);
1161
        result.append(" entries for a total of ");
1162
        int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1163
        if (bits == 8 && useCharForByte) {
1164
            sizeOfTable *= 2;
1165
        }
1166
        result.append(sizeOfTable);
1167
        result.append(" bytes.").append(commentEnd).append("\n\n");
1168
        if (Csyntax)
1169
            result.append("  static ");
1170
        else
1171
            result.append("  static final ");
1172
        result.append(atype);
1173
        result.append(" ").append(name).append("[");
1174
        if (Csyntax)
1175
            result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1176
        if (tableAsString) {
1177
            if (noConversion) {
1178
                result.append("] = (\n");
1179
            } else {
1180
                result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1181
                result.append("static final String ").append(name).append("_DATA =\n");
1182
            }
1183
            int CHARS_PER_LINE = 8;
1184
            StringBuffer theString = new StringBuffer();
1185
            int entriesInCharSoFar = 0;
1186
            char ch = '\u0000';
1187
            int charsPerEntry = -entriesPerChar;
1188
            for (int j=0; j<table.length; ++j) {
1189
                //long entry = table[j] >> extract;
1190
                long entry;
1191
                if ("A".equals(name))
1192
                    entry = (table[j] & 0xffffffffL) >> extract;
1193
                else
1194
                    entry = (table[j] >> extract);
1195
                if (shiftEntries) entry <<= shift;
1196
                if (entry >= (1L << bits)) {
1197
                    FAIL("Entry too big");
1198
                }
1199
                if (entriesPerChar > 0) {
1200
                    // Pack multiple entries into a character
1201
                    ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1202
                    ++entriesInCharSoFar;
1203
                    if (entriesInCharSoFar == entriesPerChar) {
1204
                        // Character is full
1205
                        theString.append(ch);
1206
                        entriesInCharSoFar = 0;
1207
                        ch = '\u0000';
1208
                    }
1209
                }
1210
                else {
1211
                    // Use multiple characters per entry
1212
                    for (int k=0; k<charsPerEntry; ++k) {
1213
                        ch = (char)(entry >> ((charsPerEntry-1)*16));
1214
                        entry <<= 16;
1215
                        theString.append(ch);
1216
                    }
1217
                }
1218
            }
1219
            if (entriesInCharSoFar > 0) {
1220
                while (entriesInCharSoFar < entriesPerChar) {
1221
                    ch = (char)((int)ch >> bits);
1222
                    ++entriesInCharSoFar;
1223
                }
1224
                theString.append(ch);
1225
                entriesInCharSoFar = 0;
1226
            }
1227
            result.append(Utility.formatForSource(theString.toString(), "    "));
1228
            if (noConversion) {
1229
                result.append(").toCharArray()");
1230
            }
1231
            result.append(";\n\n  ");
1232

1233
            if (!noConversion) {
1234
                addInitializer(name, atype, entriesPerChar, bits, table.length);
1235
            }
1236
        }
1237
        else {
1238
            result.append("] = {");
1239
            boolean castEntries = shiftEntries && (bits < 32);
1240
            int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1241
                bits == 2 ? 16*4 :
1242
                bits == 4 ? 8*4 :
1243
                bits == 8 ? 8 :
1244
                bits == 16 ? 8 :
1245
                bits == 32 ? 4 : 2) :
1246
                (bits == 8 ? 8 :
1247
                bits == 16 ? 8 : 4);
1248
            int printMask = properties ? 0 :
1249
            Math.min(1 << size,
1250
                printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1251
            int commentShift = ((1 << size) == table.length) ? 0 : size;
1252
            int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1253
            long val = 0;
1254
            for (int j = 0; j < table.length; j++) {
1255
                if ((j & printMask) == 0) {
1256
                    while (result.charAt(result.length() - 1) == ' ')
1257
                        result.setLength(result.length() - 1);
1258
                    result.append("\n    ");
1259
                }
1260
        PRINT:  {
1261
                if (castEntries)
1262
                    result.append("(").append(atype).append(")(");
1263
                long entry = table[j] >> extract;
1264
                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1265
                int k = j & packMask;
1266
                if (bits >= 8)
1267
                    val = entry;
1268
                else if (k == 0) {
1269
                    val = entry;
1270
                    break PRINT;
1271
                }
1272
                else {
1273
                    val |= (entry << (k*bits));
1274
                    if (k != packMask)
1275
                        break PRINT;
1276
                }
1277
                if (val > maxPosEntry && !Csyntax) { // liu
1278
                // For values that are out of range, convert them to in-range negative values.
1279
                // Actually, output the '-' and convert them to the negative of the corresponding
1280
                // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1281
                    result.append('-');
1282
                    val = maxPosEntry + maxPosEntry + 2 - val;
1283
                }
1284
                if (hexFormat) {
1285
                    result.append("0x");
1286
                    if (bits == 8)
1287
                        result.append(hex2((byte)val));
1288
                    else if (bits == 16)
1289
                        result.append(hex4((short)val));
1290
                    else if (bits == 32 || bits < 8)
1291
                        result.append(hex8((int)val));
1292
                    else {
1293
                        result.append(hex16(val));
1294
                        if (!Csyntax)
1295
                            result.append("L");
1296
                    }
1297
                }
1298
                else {
1299
                    if (bits == 8)
1300
                        result.append(dec3(val));
1301
                    else if (bits == 64) {
1302
                        result.append(dec5(val));
1303
                        if (!Csyntax)
1304
                            result.append("L");
1305
                    }
1306
                    else
1307
                        result.append(dec5(val));
1308
                }
1309
                if (shiftEntries)
1310
                    result.append("<<").append(shift);
1311
                if (castEntries) result.append(")");
1312
                if (j < (table.length - 1))
1313
                    result.append(", ");
1314
                else
1315
                    result.append("  ");
1316
                if ((j & printMask) == printMask) {
1317
                    result.append(" ").append(commentStart).append(" ");
1318
                    if (hexComment)
1319
                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1320
                    else
1321
                        result.append(dec3((j & ~commentMask) >> commentShift));
1322
                    if (properties) propertiesComments(result, val);
1323
                    result.append(commentEnd);
1324
                }
1325
                } // end PRINT
1326
            }
1327
            result.append("\n  };\n\n  ");
1328
        }
1329
    }
1330

1331
    static void genCaseMapTableDeclaration(StringBuffer result) {
1332
        String myTab = "    ";
1333
        result.append(myTab + "static final char[][][] charMap;\n");
1334
    }
1335

1336
    static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1337
        String myTab = "    ";
1338
        int ch;
1339
        char[] map;
1340
        result.append(myTab + "charMap = new char[][][] {\n");
1341
        for (int x = 0; x < specialCaseMaps.length; x++) {
1342
            ch = specialCaseMaps[x].getCharSource();
1343
            map = specialCaseMaps[x].getUpperCaseMap();
1344
            result.append(myTab + myTab);
1345
            result.append("{ ");
1346
            result.append("{\'\\u"+hex4(ch)+"\'}, {");
1347
            for (int y = 0; y < map.length; y++) {
1348
                result.append("\'\\u"+hex4(map[y])+"\', ");
1349
            }
1350
            result.append("} },\n");
1351
        }
1352
        result.append(myTab + "};\n");
1353

1354
    }
1355

1356
    /**
1357
    * The propertiesComments method generates comments describing encoded
1358
    * character properties.
1359
    *
1360
    * @param result     a StringBuffer, to which the generated source code
1361
    *                   text is to be appended
1362
    * @param val                encoded character properties
1363
    *
1364
    * @see GenerateCharacter#genTable
1365
    */
1366

1367
    static void propertiesComments(StringBuffer result, long val) {
1368
        result.append("   ");
1369
        switch ((int)(val & maskType)) {
1370
            case UnicodeSpec.CONTROL:
1371
                result.append("Cc");
1372
                break;
1373
            case UnicodeSpec.FORMAT:
1374
                result.append("Cf");
1375
                break;
1376
            case UnicodeSpec.PRIVATE_USE:
1377
                result.append("Co");
1378
                break;
1379
            case UnicodeSpec.SURROGATE:
1380
                result.append("Cs");
1381
                break;
1382
            case UnicodeSpec.LOWERCASE_LETTER:
1383
                result.append("Ll");
1384
                break;
1385
            case UnicodeSpec.MODIFIER_LETTER:
1386
                result.append("Lm");
1387
                break;
1388
            case UnicodeSpec.OTHER_LETTER:
1389
                result.append("Lo");
1390
                break;
1391
            case UnicodeSpec.TITLECASE_LETTER:
1392
                result.append("Lt");
1393
                break;
1394
            case UnicodeSpec.UPPERCASE_LETTER:
1395
                result.append("Lu");
1396
                break;
1397
            case UnicodeSpec.COMBINING_SPACING_MARK:
1398
                result.append("Mc");
1399
                break;
1400
            case UnicodeSpec.ENCLOSING_MARK:
1401
                result.append("Me");
1402
                break;
1403
            case UnicodeSpec.NON_SPACING_MARK:
1404
                result.append("Mn");
1405
                break;
1406
            case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1407
                result.append("Nd");
1408
                break;
1409
            case UnicodeSpec.LETTER_NUMBER:
1410
                result.append("Nl");
1411
                break;
1412
            case UnicodeSpec.OTHER_NUMBER:
1413
                result.append("No");
1414
                break;
1415
            case UnicodeSpec.CONNECTOR_PUNCTUATION:
1416
                result.append("Pc");
1417
                break;
1418
            case UnicodeSpec.DASH_PUNCTUATION:
1419
                result.append("Pd");
1420
                break;
1421
            case UnicodeSpec.END_PUNCTUATION:
1422
                result.append("Pe");
1423
                break;
1424
            case UnicodeSpec.OTHER_PUNCTUATION:
1425
                result.append("Po");
1426
                break;
1427
            case UnicodeSpec.START_PUNCTUATION:
1428
                result.append("Ps");
1429
                break;
1430
            case UnicodeSpec.CURRENCY_SYMBOL:
1431
                result.append("Sc");
1432
                break;
1433
            case UnicodeSpec.MODIFIER_SYMBOL:
1434
                result.append("Sk");
1435
                break;
1436
            case UnicodeSpec.MATH_SYMBOL:
1437
                result.append("Sm");
1438
                break;
1439
            case UnicodeSpec.OTHER_SYMBOL:
1440
                result.append("So");
1441
                break;
1442
            case UnicodeSpec.LINE_SEPARATOR:
1443
                result.append("Zl"); break;
1444
            case UnicodeSpec.PARAGRAPH_SEPARATOR:
1445
                result.append("Zp");
1446
                break;
1447
            case UnicodeSpec.SPACE_SEPARATOR:
1448
                result.append("Zs");
1449
                break;
1450
            case UnicodeSpec.UNASSIGNED:
1451
                result.append("unassigned");
1452
                break;
1453
        }
1454

1455
        switch ((int)((val & maskBidi) >> shiftBidi)) {
1456
            case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1457
                result.append(", L");
1458
                break;
1459
            case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1460
                result.append(", R");
1461
                break;
1462
            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1463
                result.append(", EN");
1464
                break;
1465
            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1466
                result.append(", ES");
1467
                break;
1468
            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1469
                result.append(", ET");
1470
                break;
1471
            case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1472
                result.append(", AN");
1473
                break;
1474
            case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1475
                result.append(", CS");
1476
                break;
1477
            case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1478
                result.append(", B");
1479
                break;
1480
            case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1481
                result.append(", S");
1482
                break;
1483
            case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1484
                result.append(", WS");
1485
                break;
1486
            case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1487
                result.append(", ON");
1488
                break;
1489
        }
1490
        if ((val & maskUpperCase) != 0) {
1491
            result.append(", hasUpper (subtract ");
1492
            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1493
        }
1494
        if ((val & maskLowerCase) != 0) {
1495
            result.append(", hasLower (add ");
1496
            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1497
        }
1498
        if ((val & maskTitleCase) != 0) {
1499
            result.append(", hasTitle");
1500
        }
1501
        if ((val & maskIdentifierInfo) == valueIgnorable) {
1502
            result.append(", ignorable");
1503
        }
1504
        if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1505
            result.append(", identifier part");
1506
        }
1507
        if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1508
            result.append(", underscore");
1509
        }
1510
        if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1511
            result.append(", whitespace");
1512
        }
1513
        if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1514
            result.append(", currency");
1515
        }
1516
        if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1517
            result.append(", identifier start");
1518
        }
1519
        if ((val & maskNumericType) == valueDigit) {
1520
            result.append(", decimal ");
1521
            result.append((val & maskDigitOffset) >> shiftDigitOffset);
1522
        }
1523
        if ((val & maskNumericType) == valueStrangeNumeric) {
1524
            result.append(", strange");
1525
        }
1526
        if ((val & maskNumericType) == valueJavaSupradecimal) {
1527
            result.append(", supradecimal ");
1528
            result.append((val & maskDigitOffset) >> shiftDigitOffset);
1529
        }
1530
    }
1531

1532
    static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1533

1534
    static String tableName(int j) { return tableNames[j]; }
1535

1536
    /**
1537
    * The genAccess method generates source code for one table access expression.
1538
    *
1539
    * Most of the complexity stems from handling various options as to
1540
    * table representation, such as whether it contains values so large that
1541
    * they are represented as negative values and whether the table values are
1542
    * preshifted.  This method also avoids such "ugly" expressions as shifting
1543
    * by distance zero, masking when no masking is necessary, and so on.
1544
    * For clarity, it generates expressions that do not rely on operator
1545
    * precedence, but otherwise it avoids generating redundant parentheses.
1546
    *
1547
    * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1548
    * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1549
    *
1550
    * @param tbl                the name of the final table to be accessed
1551
    * @param var                the variable name that appeared in parentheses in the
1552
    *                           "Lookup" command
1553
    * @param bits       the number of bits (not bytes) to be used to represent
1554
    *                   the final table entry
1555
    * @return   the replacement text for the "Lookup(xxx)" command, as a String
1556
    *
1557
    * @see GenerateCharacter#replaceCommand
1558
    */
1559

1560
    static String genAccess(String tbl, String var, int bits) {
1561
        String access = null;
1562
        int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1563
        for (int k = 0; k < sizes.length; k++) {
1564
            int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1565
            int shift = shifts[k] + offset;
1566
            String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1567
            int mask = (1 << (sizes[k] - offset)) - 1;
1568
            String masked = (k == 0) ? shifted :
1569
              "(" + shifted + "&0x" + hex(mask) + ")";
1570
            String index = (k == 0) ? masked :
1571
             (mask == 0) ? access : "(" + access + "|" + masked + ")";
1572
            String indexNoParens = (index.charAt(0) != '(') ? index :
1573
                 index.substring(1, index.length() - 1);
1574
            String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1575
            String fetched = tblname + "[" + indexNoParens + "]";
1576
            String zeroextended = (zeroextend[k] == 0) ? fetched :
1577
                "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1578
            int adjustment = preshifted[k] ? 0 :
1579
               sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1580
            String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1581
                "(" + zeroextended + "<<" + adjustment + ")";
1582
            String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1583
                (bits == 2) ? "((" + var + "&0xF)<<1)" :
1584
                (bits == 4) ? "((" + var + "&7)<<2)" : null;
1585
            String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1586
                "((" + adjusted + ">>" + bitshift + ")&" +
1587
                (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1588
            access = extracted;
1589
        }
1590
        return access;
1591
    }
1592

1593
    /* The command line arguments are decoded and used to set the following
1594
     global variables.
1595
     */
1596

1597
    static boolean verbose = false;
1598
    static boolean nobidi = false;
1599
    static boolean nomirror = false;
1600
    static boolean identifiers = false;
1601
    static boolean Csyntax = false;
1602
    static String TemplateFileName = null;
1603
    static String OutputFileName = null;
1604
    static String UnicodeSpecFileName = null; // liu
1605
    static String SpecialCasingFileName = null;
1606
    static String PropListFileName = null;
1607
    static boolean useCharForByte = false;
1608
    static int[] sizes;
1609
    static int bins = 0; // liu; if > 0, then perform search
1610
    static boolean tableAsString = false;
1611
    static boolean bLatin1 = false;
1612

1613
    static String commandLineDescription;
1614

1615
    /* Other global variables, equal in length to the "sizes" array. */
1616

1617
    static int[] shifts;
1618
    static int[] zeroextend;
1619
    static int[] bytes;
1620
    static boolean[] preshifted;
1621
    static long[][] tables;
1622

1623

1624
    /* Other global variables */
1625
    static String commentStart;
1626
    static String commentEnd;
1627

1628
    static StringBuffer initializers = new StringBuffer();
1629

1630
    /* special casing rules for 1:M toUpperCase mappings */
1631
    static SpecialCaseMap[] specialCaseMaps;
1632

1633
    /**
1634
    * Process the command line arguments.
1635
    *
1636
    * The allowed flags in command line are:
1637
    * <dl>
1638
    * <dt> -verbose             <dd> Emit comments to standard output describing
1639
    *                                   what's going on during the processing.
1640
    * <dt> -nobidi              <dd> Do not include bidi categories in the
1641
    *                                   encoded character properties.
1642
    * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1643
    *                        character properties.
1644
    * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1645
    * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1646
    * <dt> -o filename          <dd> Specify output file name.
1647
    * <dt> -template filename   <dd> Specify template input file name.
1648
    * <dt> -spec filename        <dd> Specify Unicode spec file name.
1649
    * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1650
    * <dt> -search bins          <dd> Try different partitions into the specified
1651
    *                                    number of bins.  E.g., for 2 bins, try
1652
    *                                    16 0, 15 1,..., 0 16.
1653
    * <dt> -string               <dd> Create table as string.  Only valid with Java
1654
    *                                    syntax.
1655
    * <dt> -latin1          <dd> Create a latin 1 only property table.
1656
    * </dl>
1657
    * In addition, decimal literals may appear as command line arguments;
1658
    * each one represents the number of bits of the character to be broken
1659
    * off at each lookup step.  If present, they must add up to 16 (the number
1660
    * of bits in a char value).  For smaller tables, the last value should
1661
    * be 0; values other than the last one may not be zero.  If no such
1662
    * numeric values are provided, default values are used.
1663
    *
1664
    * @param args       the command line arguments, as an array of String
1665
    *
1666
    * @see GenerateCharacter#main
1667
    */
1668

1669
    static void processArgs(String[] args) {
1670
        StringBuffer desc = new StringBuffer("java GenerateCharacter");
1671
        for (int j=0; j<args.length; ++j) {
1672
            desc.append(" " + args[j]);
1673
        }
1674
        for (int j = 0; j < args.length; j++) {
1675
            if (args[j].equals("-verbose") || args[j].equals("-v"))
1676
                verbose = true;
1677
            else if (args[j].equals("-nobidi"))
1678
                nobidi = true;
1679
            else if (args[j].equals("-nomirror"))
1680
                nomirror = true;
1681
            else if (args[j].equals("-identifiers"))
1682
                identifiers = true;
1683
            else if (args[j].equals("-c"))
1684
                Csyntax = true;
1685
            else if (args[j].equals("-string"))
1686
                tableAsString = true;
1687
            else if (args[j].equals("-o")) {
1688
                if (j == args.length - 1) {
1689
                    FAIL("File name missing after -o");
1690
                }
1691
                else {
1692
                    OutputFileName = args[++j];
1693
                }
1694
            }
1695
            else if (args[j].equals("-search")) {
1696
                if (j == args.length - 1)
1697
                    FAIL("Bin count missing after -search");
1698
                else {
1699
                    bins = Integer.parseInt(args[++j]);
1700
                    if (bins < 1 || bins > 10)
1701
                        FAIL("Bin count must be >= 1 and <= 10");
1702
                }
1703
            }
1704
            else if (args[j].equals("-template")) {
1705
                if (j == args.length - 1)
1706
                    FAIL("File name missing after -template");
1707
                else
1708
                    TemplateFileName = args[++j];
1709
            }
1710
            else if (args[j].equals("-spec")) { // liu
1711
                if (j == args.length - 1) {
1712
                    FAIL("File name missing after -spec");
1713
                }
1714
                else {
1715
                    UnicodeSpecFileName = args[++j];
1716
                }
1717
            }
1718
            else if (args[j].equals("-specialcasing")) {
1719
                if (j == args.length -1) {
1720
                    FAIL("File name missing after -specialcasing");
1721
                }
1722
                else {
1723
                    SpecialCasingFileName = args[++j];
1724
                }
1725
            }
1726
            else if (args[j].equals("-proplist")) {
1727
                if (j == args.length -1) {
1728
                    FAIL("File name missing after -proplist");
1729
                }
1730
                else {
1731
                    PropListFileName = args[++j];
1732
                }
1733
            }
1734
            else if (args[j].equals("-plane")) {
1735
                if (j == args.length -1) {
1736
                    FAIL("Plane number missing after -plane");
1737
                }
1738
                else {
1739
                    plane = Integer.parseInt(args[++j]);
1740
                }
1741
                if (plane > 0) {
1742
                    bLatin1 = false;
1743
                }
1744
            }
1745
            else if ("-usecharforbyte".equals(args[j])) {
1746
                useCharForByte = true;
1747
            }
1748
            else if (args[j].equals("-latin1")) {
1749
                bLatin1 = true;
1750
                plane = 0;
1751
            }
1752
            else {
1753
                try {
1754
                    int val = Integer.parseInt(args[j]);
1755
                    if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1756
                    if (sizes == null)
1757
                        sizes = new int[1];
1758
                    else {
1759
                        int[] newsizes = new int[sizes.length + 1];
1760
                        System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1761
                        sizes = newsizes;
1762
                    }
1763
                    sizes[sizes.length - 1] = val;
1764
                }
1765
                catch(NumberFormatException e) {
1766
                    FAIL("Unknown switch: " + args[j]);
1767
                }
1768
            }
1769
        }
1770
        if (Csyntax && tableAsString) {
1771
            FAIL("Can't specify table as string with C syntax");
1772
        }
1773
        if (sizes == null) {
1774
            desc.append(" [");
1775
            if (identifiers) {
1776
                int[] newsizes = { 8, 4, 4 };           // Good default values
1777
                desc.append("8 4 4]");
1778
                sizes = newsizes;
1779
            }
1780
            else {
1781
                int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1782
                desc.append("10 5 1]");
1783
                sizes = newsizes;
1784
            }
1785
        }
1786
        if (UnicodeSpecFileName == null) { // liu
1787
            UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1788
            desc.append(" [-spec " + UnicodeSpecFileName + ']');
1789
        }
1790
        if (SpecialCasingFileName == null) {
1791
            SpecialCasingFileName = DefaultSpecialCasingFileName;
1792
            desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1793
        }
1794
        if (PropListFileName == null) {
1795
            PropListFileName = DefaultPropListFileName;
1796
            desc.append(" [-proplist " + PropListFileName + ']');
1797
        }
1798
        if (TemplateFileName == null) {
1799
            TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1800
                  : DefaultJavaTemplateFileName);
1801
            desc.append(" [-template " + TemplateFileName + ']');
1802
        }
1803
        if (OutputFileName == null) {
1804
            OutputFileName = (Csyntax ? DefaultCOutputFileName
1805
                    : DefaultJavaOutputFileName);
1806
            desc.append(" [-o " + OutputFileName + ']');
1807
        }
1808
        commentStart = (Csyntax ? "/*" : "//");
1809
        commentEnd = (Csyntax ? " */" : "");
1810
        commandLineDescription = desc.toString();
1811
    }
1812

1813
    private static void searchBins(long[] map, int binsOccupied) throws Exception {
1814
        int bitsFree = 16;
1815
        for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1816
        if (binsOccupied == (bins-1)) {
1817
            sizes[binsOccupied] = bitsFree;
1818
            generateForSizes(map);
1819
        }
1820
        else {
1821
            for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1822
                sizes[binsOccupied] = i;
1823
                searchBins(map, binsOccupied+1);
1824
            }
1825
        }
1826
    }
1827

1828
    private static void generateForSizes(long[] map) throws Exception {
1829
        int sum = 0;
1830
        shifts = new int[sizes.length];
1831
        for (int k = sizes.length - 1; k >= 0; k--) {
1832
            shifts[k] = sum;
1833
            sum += sizes[k];
1834
        }
1835
        if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1836
            FAIL("Bit field widths total to " + sum +
1837
             ": wrong total for map of size " + map.length);
1838
        }
1839
        // need a table for each set of lookup bits in char
1840
        tables = new long[sizes.length][];
1841
        // the last table is the map
1842
        tables[sizes.length - 1] = map;
1843
        for (int j = sizes.length - 1; j > 0; j--) {
1844
            if (verbose && bins==0)
1845
                System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1846
            long[][] temp = buildTable(tables[j], sizes[j]);
1847
            tables[j-1] = temp[0];
1848
            tables[j] = temp[1];
1849
        }
1850
        preshifted = new boolean[sizes.length];
1851
        zeroextend = new int[sizes.length];
1852
        bytes = new int[sizes.length];
1853
        for (int j = 0; j < sizes.length - 1; j++) {
1854
            int len = tables[j+1].length;
1855
            int size = sizes[j+1];
1856
            if (len > 0x100 && (len >> size) <= 0x100) {
1857
                len >>= size;
1858
                preshifted[j] = false;
1859
            }
1860
            else if (len > 0x10000 && (len >> size) <= 0x10000) {
1861
                len >>= size;
1862
                preshifted[j] = false;
1863
            }
1864
            else preshifted[j] = true;
1865
            if (Csyntax)
1866
                zeroextend[j] = 0;
1867
            else if (len > 0x7F && len <= 0xFF) {
1868
                if (!useCharForByte) {
1869
                    zeroextend[j] = 0xFF;
1870
                }
1871
            } else if (len > 0x7FFF && len <= 0xFFFF)
1872
                zeroextend[j] = 0xFFFF;
1873
            else zeroextend[j] = 0;
1874
            if (len <= 0x100) bytes[j] = 1;
1875
            else if (len <= 0x10000) bytes[j] = 2;
1876
            else bytes[j] = 4;
1877
        }
1878
        preshifted[sizes.length - 1] = true;
1879
        zeroextend[sizes.length - 1] = 0;
1880
        bytes[sizes.length - 1] = 0;
1881
        if (bins > 0) {
1882
            int totalBytes = getTotalBytes();
1883
            String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1884
            int accessComplexity = 0;
1885
            for (int j=0; j<access.length(); ++j) {
1886
                char ch = access.charAt(j);
1887
                if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1888
                if (ch == '<' || ch == '>') ++j;
1889
            }
1890
            System.out.print("(");
1891
            for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1892
            System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1893
            return;
1894
        }
1895
        if (verbose) {
1896
            System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1897
            for (int j = 0; j < sizes.length; j++) {
1898
                System.out.println(dec5(j) + "\t" +
1899
                    dec5(sizes[j]) + "\t" +
1900
                    dec5(tables[j].length) + "\t" +
1901
                    dec5(shifts[j]) + "\t" +
1902
                    dec5(zeroextend[j]) + "\t" +
1903
                    dec5(bytes[j]) + "\t " +
1904
                    preshifted[j]);
1905
            }
1906
        }
1907
        if (verbose) {
1908
            System.out.println("Generating source code for class Character");
1909
            System.out.println("A table access looks like " +
1910
                         genAccess("A", "ch", (identifiers ? 2 : 32)));
1911
        }
1912
        generateCharacterClass(TemplateFileName, OutputFileName);
1913
    }
1914

1915
    /**
1916
    * The main program for generating source code for the Character class.
1917
    * The basic outline of its operation is:
1918
    * <ol>
1919
    * <li> Process the command line arguments.  One result of this process
1920
    *           is a list of sizes (measured in bits and summing to 16).
1921
    * <li> Get the Unicode character property data from the specification file.
1922
    * <li> From that, build a map that has, for each character code, its
1923
    *           relevant properties encoded as a long integer value.
1924
    * <li> Repeatedly compress the map, producing a compressed table and a
1925
    *           new map.  This is done once for each size value in the list.
1926
    *           When this is done, we have a set of tables.
1927
    * <li> Make some decisions about table representation; record these
1928
    *           decisions in arrays named preshifted, zeroextend, and bytes.
1929
    * <li> Generate the source code for the class Character by performing
1930
    *           macro processing on a template file.
1931
    * </ol>
1932
    *
1933
    * @param args       the command line arguments, as an array of String
1934
    *
1935
    * @see GenerateCharacter#processArgs
1936
    * @see UnicodeSpec#readSpecFile
1937
    * @see GenerateCharacter#buildMap
1938
    * @see GenerateCharacter#buildTable
1939
    * @see GenerateCharacter#generateCharacterClass
1940
    */
1941

1942
    public static void main(String[] args) {
1943
        processArgs(args);
1944
        try {
1945

1946
            UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1947
            specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1948
            PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1949

1950
            if (verbose) {
1951
                System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1952
            }
1953
            long[] map = buildMap(data, specialCaseMaps, propList);
1954
            if (verbose) {
1955
                System.err.println("Completed building of initial map");
1956
            }
1957

1958
            if (bins == 0) {
1959
                generateForSizes(map);
1960
            }
1961
            else {
1962
                while (bins > 0) {
1963
                    sizes = new int[bins];
1964
                    searchBins(map, 0);
1965
                    --bins;
1966
                }
1967
            }
1968
            if (verbose && false) {
1969
                System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1970
                             hex8(maxOffsetSeen));
1971
                System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1972
                             hex8(maxOffset));
1973
            }
1974
        }
1975
        catch (FileNotFoundException e) { FAIL(e.toString()); }
1976
        catch (IOException e) { FAIL(e.toString()); }
1977
        catch (Throwable e) {
1978
            System.out.println("Unexpected exception:");
1979
            e.printStackTrace();
1980
            FAIL("Unexpected exception!");
1981
        }
1982
        if (verbose) { System.out.println("Done!");}
1983
    }
1984

1985
}   // end class
1986

1987
Product

Resources

Company