Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/test/java/text/BreakIterator/BreakIteratorTest.java
38813 views
1
/*
2
* Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation.
8
*
9
* This code is distributed in the hope that it will be useful, but WITHOUT
10
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12
* version 2 for more details (a copy is included in the LICENSE file that
13
* accompanied this code).
14
*
15
* You should have received a copy of the GNU General Public License version
16
* 2 along with this work; if not, write to the Free Software Foundation,
17
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18
*
19
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
* or visit www.oracle.com if you need additional information or have any
21
* questions.
22
*/
23
24
/*
25
* @test
26
* @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
27
* 4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
28
* 4152416 4153072 4158381 4214367 4217703 4638433
29
* @library /java/text/testlib
30
* @run main/timeout=2000 BreakIteratorTest
31
* @summary test BreakIterator
32
*/
33
34
/*
35
*
36
*
37
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
38
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
39
*
40
* Portions copyright (c) 2007 Sun Microsystems, Inc.
41
* All Rights Reserved.
42
*
43
* The original version of this source code and documentation
44
* is copyrighted and owned by Taligent, Inc., a wholly-owned
45
* subsidiary of IBM. These materials are provided under terms
46
* of a License Agreement between Taligent and Sun. This technology
47
* is protected by multiple US and International patents.
48
*
49
* This notice and attribution to Taligent may not be removed.
50
* Taligent is a registered trademark of Taligent, Inc.
51
*
52
* Permission to use, copy, modify, and distribute this software
53
* and its documentation for NON-COMMERCIAL purposes and without
54
* fee is hereby granted provided that this copyright notice
55
* appears in all copies. Please refer to the file "copyright.html"
56
* for further important copyright and licensing information.
57
*
58
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
59
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
60
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
61
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
62
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
63
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
64
*
65
*/
66
67
import java.text.BreakIterator;
68
import java.text.CharacterIterator;
69
import java.text.StringCharacterIterator;
70
import java.util.Locale;
71
import java.util.Vector;
72
import java.util.Enumeration;
73
import java.io.*;
74
75
public class BreakIteratorTest extends IntlTest
76
{
77
private BreakIterator characterBreak;
78
private BreakIterator wordBreak;
79
private BreakIterator lineBreak;
80
private BreakIterator sentenceBreak;
81
82
public static void main(String[] args) throws Exception {
83
new BreakIteratorTest().run(args);
84
}
85
86
/**
 * Creates one iterator of each kind (character, word, line, sentence) using
 * the no-argument {@link BreakIterator} factory methods.
 */
public BreakIteratorTest()
{
    this.characterBreak = BreakIterator.getCharacterInstance();
    this.wordBreak = BreakIterator.getWordInstance();
    this.lineBreak = BreakIterator.getLineInstance();
    this.sentenceBreak = BreakIterator.getSentenceInstance();
}
93
94
//=========================================================================
95
// general test subroutines
96
//=========================================================================
97
98
private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
99
StringBuffer buffer = new StringBuffer();
100
String text;
101
for (int i = 0; i < expectedResult.size(); i++) {
102
text = (String)expectedResult.elementAt(i);
103
buffer.append(text);
104
}
105
text = buffer.toString();
106
107
bi.setText(text);
108
109
Vector nextResults = testFirstAndNext(bi, text);
110
Vector previousResults = testLastAndPrevious(bi, text);
111
112
logln("comparing forward and backward...");
113
int errs = getErrorCount();
114
compareFragmentLists("forward iteration", "backward iteration", nextResults,
115
previousResults);
116
if (getErrorCount() == errs) {
117
logln("comparing expected and actual...");
118
compareFragmentLists("expected result", "actual result", expectedResult,
119
nextResults);
120
}
121
122
int[] boundaries = new int[expectedResult.size() + 3];
123
boundaries[0] = BreakIterator.DONE;
124
boundaries[1] = 0;
125
for (int i = 0; i < expectedResult.size(); i++)
126
boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).
127
length();
128
boundaries[boundaries.length - 1] = BreakIterator.DONE;
129
130
testFollowing(bi, text, boundaries);
131
testPreceding(bi, text, boundaries);
132
testIsBoundary(bi, text, boundaries);
133
134
doMultipleSelectionTest(bi, text);
135
}
136
137
private Vector testFirstAndNext(BreakIterator bi, String text) {
138
int p = bi.first();
139
int lastP = p;
140
Vector<String> result = new Vector<String>();
141
142
if (p != 0)
143
errln("first() returned " + p + " instead of 0");
144
while (p != BreakIterator.DONE) {
145
p = bi.next();
146
if (p != BreakIterator.DONE) {
147
if (p <= lastP)
148
errln("next() failed to move forward: next() on position "
149
+ lastP + " yielded " + p);
150
151
result.addElement(text.substring(lastP, p));
152
}
153
else {
154
if (lastP != text.length())
155
errln("next() returned DONE prematurely: offset was "
156
+ lastP + " instead of " + text.length());
157
}
158
lastP = p;
159
}
160
return result;
161
}
162
163
private Vector testLastAndPrevious(BreakIterator bi, String text) {
164
int p = bi.last();
165
int lastP = p;
166
Vector<String> result = new Vector<String>();
167
168
if (p != text.length())
169
errln("last() returned " + p + " instead of " + text.length());
170
while (p != BreakIterator.DONE) {
171
p = bi.previous();
172
if (p != BreakIterator.DONE) {
173
if (p >= lastP)
174
errln("previous() failed to move backward: previous() on position "
175
+ lastP + " yielded " + p);
176
177
result.insertElementAt(text.substring(p, lastP), 0);
178
}
179
else {
180
if (lastP != 0)
181
errln("previous() returned DONE prematurely: offset was "
182
+ lastP + " instead of 0");
183
}
184
lastP = p;
185
}
186
return result;
187
}
188
189
private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
190
int p1 = 0;
191
int p2 = 0;
192
String s1;
193
String s2;
194
int t1 = 0;
195
int t2 = 0;
196
197
while (p1 < f1.size() && p2 < f2.size()) {
198
s1 = (String)f1.elementAt(p1);
199
s2 = (String)f2.elementAt(p2);
200
t1 += s1.length();
201
t2 += s2.length();
202
203
if (s1.equals(s2)) {
204
debugLogln(" >" + s1 + "<");
205
++p1;
206
++p2;
207
}
208
else {
209
int tempT1 = t1;
210
int tempT2 = t2;
211
int tempP1 = p1;
212
int tempP2 = p2;
213
214
while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
215
while (tempT1 < tempT2 && tempP1 < f1.size()) {
216
tempT1 += ((String)f1.elementAt(tempP1)).length();
217
++tempP1;
218
}
219
while (tempT2 < tempT1 && tempP2 < f2.size()) {
220
tempT2 += ((String)f2.elementAt(tempP2)).length();
221
++tempP2;
222
}
223
}
224
logln("*** " + f1Name + " has:");
225
while (p1 <= tempP1 && p1 < f1.size()) {
226
s1 = (String)f1.elementAt(p1);
227
t1 += s1.length();
228
debugLogln(" *** >" + s1 + "<");
229
++p1;
230
}
231
logln("***** " + f2Name + " has:");
232
while (p2 <= tempP2 && p2 < f2.size()) {
233
s2 = (String)f2.elementAt(p2);
234
t2 += s2.length();
235
debugLogln(" ***** >" + s2 + "<");
236
++p2;
237
}
238
errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2);
239
}
240
}
241
}
242
243
private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
244
logln("testFollowing():");
245
int p = 2;
246
int i = 0;
247
try {
248
for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
249
if (i == boundaries[p])
250
++p;
251
252
int b = bi.following(i);
253
logln("bi.following(" + i + ") -> " + b);
254
if (b != boundaries[p])
255
errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
256
+ ", got " + b);
257
}
258
} catch (IllegalArgumentException illargExp) {
259
errln("IllegalArgumentException caught from following() for offset: " + i);
260
}
261
}
262
263
private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
264
logln("testPreceding():");
265
int p = 0;
266
int i = 0;
267
try {
268
for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
269
int b = bi.preceding(i);
270
logln("bi.preceding(" + i + ") -> " + b);
271
if (b != boundaries[p])
272
errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
273
+ ", got " + b);
274
275
if (i == boundaries[p + 1])
276
++p;
277
}
278
} catch (IllegalArgumentException illargExp) {
279
errln("IllegalArgumentException caught from preceding() for offset: " + i);
280
}
281
}
282
283
private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
284
logln("testIsBoundary():");
285
int p = 1;
286
boolean isB;
287
for (int i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
288
isB = bi.isBoundary(i);
289
logln("bi.isBoundary(" + i + ") -> " + isB);
290
291
if (i == boundaries[p]) {
292
if (!isB)
293
errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
294
++p;
295
}
296
else {
297
if (isB)
298
errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
299
}
300
}
301
}
302
303
private void doMultipleSelectionTest(BreakIterator iterator, String testText)
304
{
305
logln("Multiple selection test...");
306
BreakIterator testIterator = (BreakIterator)iterator.clone();
307
int offset = iterator.first();
308
int testOffset;
309
int count = 0;
310
311
do {
312
testOffset = testIterator.first();
313
testOffset = testIterator.next(count);
314
logln("next(" + count + ") -> " + testOffset);
315
if (offset != testOffset)
316
errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
317
318
if (offset != BreakIterator.DONE) {
319
count++;
320
offset = iterator.next();
321
}
322
} while (offset != BreakIterator.DONE);
323
324
// now do it backwards...
325
offset = iterator.last();
326
count = 0;
327
328
do {
329
testOffset = testIterator.last();
330
testOffset = testIterator.next(count);
331
logln("next(" + count + ") -> " + testOffset);
332
if (offset != testOffset)
333
errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
334
335
if (offset != BreakIterator.DONE) {
336
count--;
337
offset = iterator.previous();
338
}
339
} while (offset != BreakIterator.DONE);
340
}
341
342
private void doBreakInvariantTest(BreakIterator tb, String testChars)
343
{
344
StringBuffer work = new StringBuffer("aaa");
345
int errorCount = 0;
346
347
// a break should always occur after CR (unless followed by LF), LF, PS, and LS
348
String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
349
// change this back when new BI code is added
350
351
for (int i = 0; i < breaks.length(); i++) {
352
work.setCharAt(1, breaks.charAt(i));
353
for (int j = 0; j < testChars.length(); j++) {
354
work.setCharAt(0, testChars.charAt(j));
355
for (int k = 0; k < testChars.length(); k++) {
356
char c = testChars.charAt(k);
357
358
// if a cr is followed by lf, don't do the check (they stay together)
359
if (work.charAt(1) == '\r' && (c == '\n'))
360
continue;
361
362
// CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
363
// for breaking purposes as per UTR14
364
int type1 = Character.getType(work.charAt(1));
365
int type2 = Character.getType(c);
366
if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
367
type2 == Character.CONTROL || type2 == Character.FORMAT) {
368
continue;
369
}
370
371
work.setCharAt(2, c);
372
tb.setText(work.toString());
373
boolean seen2 = false;
374
for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
375
if (l == 2)
376
seen2 = true;
377
}
378
if (!seen2) {
379
errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
380
+ " and U+" + Integer.toHexString((int)(work.charAt(2))));
381
errorCount++;
382
if (errorCount >= 75)
383
return;
384
}
385
}
386
}
387
}
388
}
389
390
private void doOtherInvariantTest(BreakIterator tb, String testChars)
391
{
392
StringBuffer work = new StringBuffer("a\r\na");
393
int errorCount = 0;
394
395
// a break should never occur between CR and LF
396
for (int i = 0; i < testChars.length(); i++) {
397
work.setCharAt(0, testChars.charAt(i));
398
for (int j = 0; j < testChars.length(); j++) {
399
work.setCharAt(3, testChars.charAt(j));
400
tb.setText(work.toString());
401
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
402
if (k == 2) {
403
errln("Break between CR and LF in string U+" + Integer.toHexString(
404
(int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
405
(int)(work.charAt(3))));
406
errorCount++;
407
if (errorCount >= 75)
408
return;
409
}
410
}
411
}
412
413
// a break should never occur before a non-spacing mark, unless it's preceded
414
// by a line terminator
415
work.setLength(0);
416
work.append("aaaa");
417
for (int i = 0; i < testChars.length(); i++) {
418
char c = testChars.charAt(i);
419
if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
420
continue;
421
work.setCharAt(1, c);
422
for (int j = 0; j < testChars.length(); j++) {
423
c = testChars.charAt(j);
424
if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
425
!= Character.ENCLOSING_MARK)
426
continue;
427
work.setCharAt(2, c);
428
429
// CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
430
// for breaking purposes as per UTR14
431
int type1 = Character.getType(work.charAt(1));
432
int type2 = Character.getType(work.charAt(2));
433
if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
434
type2 == Character.CONTROL || type2 == Character.FORMAT) {
435
continue;
436
}
437
438
tb.setText(work.toString());
439
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
440
if (k == 2) {
441
errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
442
+ " and U+" + Integer.toHexString((int)(work.charAt(2))));
443
errorCount++;
444
if (errorCount >= 75)
445
return;
446
}
447
}
448
}
449
}
450
451
public void debugLogln(String s) {
452
final String zeros = "0000";
453
String temp;
454
StringBuffer out = new StringBuffer();
455
for (int i = 0; i < s.length(); i++) {
456
char c = s.charAt(i);
457
if (c >= ' ' && c < '\u007f')
458
out.append(c);
459
else {
460
out.append("\\u");
461
temp = Integer.toHexString((int)c);
462
out.append(zeros.substring(0, 4 - temp.length()));
463
out.append(temp);
464
}
465
}
466
logln(out.toString());
467
}
468
469
//=========================================================================
470
// tests
471
//=========================================================================
472
473
public void TestWordBreak() {
474
475
Vector<String> wordSelectionData = new Vector<String>();
476
477
wordSelectionData.addElement("12,34");
478
479
wordSelectionData.addElement(" ");
480
wordSelectionData.addElement("\u00A2"); //cent sign
481
wordSelectionData.addElement("\u00A3"); //pound sign
482
wordSelectionData.addElement("\u00A4"); //currency sign
483
wordSelectionData.addElement("\u00A5"); //yen sign
484
wordSelectionData.addElement("alpha-beta-gamma");
485
wordSelectionData.addElement(".");
486
wordSelectionData.addElement(" ");
487
wordSelectionData.addElement("Badges");
488
wordSelectionData.addElement("?");
489
wordSelectionData.addElement(" ");
490
wordSelectionData.addElement("BADGES");
491
wordSelectionData.addElement("!");
492
wordSelectionData.addElement("?");
493
wordSelectionData.addElement("!");
494
wordSelectionData.addElement(" ");
495
wordSelectionData.addElement("We");
496
wordSelectionData.addElement(" ");
497
wordSelectionData.addElement("don't");
498
wordSelectionData.addElement(" ");
499
wordSelectionData.addElement("need");
500
wordSelectionData.addElement(" ");
501
wordSelectionData.addElement("no");
502
wordSelectionData.addElement(" ");
503
wordSelectionData.addElement("STINKING");
504
wordSelectionData.addElement(" ");
505
wordSelectionData.addElement("BADGES");
506
wordSelectionData.addElement("!");
507
wordSelectionData.addElement("!");
508
wordSelectionData.addElement("!");
509
510
wordSelectionData.addElement("012.566,5");
511
wordSelectionData.addElement(" ");
512
wordSelectionData.addElement("123.3434,900");
513
wordSelectionData.addElement(" ");
514
wordSelectionData.addElement("1000,233,456.000");
515
wordSelectionData.addElement(" ");
516
wordSelectionData.addElement("1,23.322%");
517
wordSelectionData.addElement(" ");
518
wordSelectionData.addElement("123.1222");
519
520
wordSelectionData.addElement(" ");
521
wordSelectionData.addElement("\u0024123,000.20");
522
523
wordSelectionData.addElement(" ");
524
wordSelectionData.addElement("179.01\u0025");
525
526
wordSelectionData.addElement("Hello");
527
wordSelectionData.addElement(",");
528
wordSelectionData.addElement(" ");
529
wordSelectionData.addElement("how");
530
wordSelectionData.addElement(" ");
531
wordSelectionData.addElement("are");
532
wordSelectionData.addElement(" ");
533
wordSelectionData.addElement("you");
534
wordSelectionData.addElement(" ");
535
wordSelectionData.addElement("X");
536
wordSelectionData.addElement(" ");
537
538
wordSelectionData.addElement("Now");
539
wordSelectionData.addElement("\r");
540
wordSelectionData.addElement("is");
541
wordSelectionData.addElement("\n");
542
wordSelectionData.addElement("the");
543
wordSelectionData.addElement("\r\n");
544
wordSelectionData.addElement("time");
545
wordSelectionData.addElement("\n");
546
wordSelectionData.addElement("\r");
547
wordSelectionData.addElement("for");
548
wordSelectionData.addElement("\r");
549
wordSelectionData.addElement("\r");
550
wordSelectionData.addElement("all");
551
wordSelectionData.addElement(" ");
552
553
generalIteratorTest(wordBreak, wordSelectionData);
554
}
555
556
public void TestBug4097779() {
557
Vector<String> wordSelectionData = new Vector<String>();
558
559
wordSelectionData.addElement("aa\u0300a");
560
wordSelectionData.addElement(" ");
561
562
generalIteratorTest(wordBreak, wordSelectionData);
563
}
564
565
public void TestBug4098467Words() {
566
Vector<String> wordSelectionData = new Vector<String>();
567
568
// What follows is a string of Korean characters (I found it in the Yellow Pages
569
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
570
// it correctly), first as precomposed syllables, and then as conjoining jamo.
571
// Both sequences should be semantically identical and break the same way.
572
// precomposed syllables...
573
wordSelectionData.addElement("\uc0c1\ud56d");
574
wordSelectionData.addElement(" ");
575
wordSelectionData.addElement("\ud55c\uc778");
576
wordSelectionData.addElement(" ");
577
wordSelectionData.addElement("\uc5f0\ud569");
578
wordSelectionData.addElement(" ");
579
wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c");
580
wordSelectionData.addElement(" ");
581
// conjoining jamo...
582
wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc");
583
wordSelectionData.addElement(" ");
584
wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab");
585
wordSelectionData.addElement(" ");
586
wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8");
587
wordSelectionData.addElement(" ");
588
wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
589
wordSelectionData.addElement(" ");
590
591
generalIteratorTest(wordBreak, wordSelectionData);
592
}
593
594
public void TestBug4117554Words() {
595
Vector<String> wordSelectionData = new Vector<String>();
596
597
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
598
// count as a Kanji character for the purposes of word breaking
599
wordSelectionData.addElement("abc");
600
wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
601
wordSelectionData.addElement("abc");
602
603
generalIteratorTest(wordBreak, wordSelectionData);
604
}
605
606
public void TestSentenceBreak() {
607
Vector<String> sentenceSelectionData = new Vector<String>();
608
609
sentenceSelectionData.addElement("This is a simple sample sentence. ");
610
sentenceSelectionData.addElement("(This is it.) ");
611
sentenceSelectionData.addElement("This is a simple sample sentence. ");
612
sentenceSelectionData.addElement("\"This isn\'t it.\" ");
613
sentenceSelectionData.addElement("Hi! ");
614
sentenceSelectionData.addElement("This is a simple sample sentence. ");
615
sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
616
sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
617
sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
618
sentenceSelectionData.addElement("He said, that I said, that you said!! ");
619
620
sentenceSelectionData.addElement("Don't rock the boat.\u2029");
621
622
sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
623
sentenceSelectionData.addElement("Not on my time (el timo.)! ");
624
625
sentenceSelectionData.addElement("So what!!\u2029");
626
627
sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
628
sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
629
sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
630
sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
631
sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
632
sentenceSelectionData.addElement("He answered, \"You may not!\" ");
633
sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
634
sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
635
sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
636
sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
637
638
generalIteratorTest(sentenceBreak, sentenceSelectionData);
639
}
640
641
public void TestBug4113835() {
642
Vector<String> sentenceSelectionData = new Vector<String>();
643
644
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
645
sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
646
647
generalIteratorTest(sentenceBreak, sentenceSelectionData);
648
}
649
650
public void TestBug4111338() {
651
Vector<String> sentenceSelectionData = new Vector<String>();
652
653
// test for bug #4111338: Don't break sentences at the boundary between CJK
654
// and other letters
655
sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
656
+ "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
657
+ "\u611d\u57b6\u2510\u5d46\".\u2029");
658
sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
659
+ "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
660
+ "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
661
sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
662
+ "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
663
+ "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
664
sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029");
665
666
generalIteratorTest(sentenceBreak, sentenceSelectionData);
667
}
668
669
public void TestBug4117554Sentences() {
670
Vector<String> sentenceSelectionData = new Vector<String>();
671
672
// Treat fullwidth variants of .!? the same as their
673
// normal counterparts
674
sentenceSelectionData.addElement("I know I'm right\uff0e ");
675
sentenceSelectionData.addElement("Right\uff1f ");
676
sentenceSelectionData.addElement("Right\uff01 ");
677
678
// Don't break sentences at boundary between CJK and digits
679
sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
680
+ "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
681
+ "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
682
683
// Break sentence between a sentence terminator and
684
// opening punctuation
685
sentenceSelectionData.addElement("no?");
686
sentenceSelectionData.addElement("(yes)");
687
688
generalIteratorTest(sentenceBreak, sentenceSelectionData);
689
}
690
691
public void TestBug4158381() {
692
Vector<String> sentenceSelectionData = new Vector<String>();
693
694
// Don't break sentence after period if it isn't followed by a space
695
sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class. ");
696
sentenceSelectionData.addElement("Another test.\u2029");
697
698
// No breaks when there are no terminators around
699
sentenceSelectionData.addElement("<P>Provides a set of "
700
+ "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
701
+ "</SUP></FONT> language) components that, "
702
+ "to the maximum degree possible, work the same on all platforms. ");
703
sentenceSelectionData.addElement("Another test.\u2029");
704
705
generalIteratorTest(sentenceBreak, sentenceSelectionData);
706
}
707
708
public void TestBug4143071() {
709
Vector<String> sentenceSelectionData = new Vector<String>();
710
711
// Make sure sentences that end with digits work right
712
sentenceSelectionData.addElement("Today is the 27th of May, 1998. ");
713
sentenceSelectionData.addElement("Tomorrow with be 28 May 1998. ");
714
sentenceSelectionData.addElement("The day after will be the 30th.\u2029");
715
716
generalIteratorTest(sentenceBreak, sentenceSelectionData);
717
}
718
719
public void TestBug4152416() {
720
Vector<String> sentenceSelectionData = new Vector<String>();
721
722
// Make sure sentences ending with a capital letter are treated correctly
723
sentenceSelectionData.addElement("The type of all primitive "
724
+ "<code>boolean</code> values accessed in the target VM. ");
725
sentenceSelectionData.addElement("Calls to xxx will return an "
726
+ "implementor of this interface.\u2029");
727
728
generalIteratorTest(sentenceBreak, sentenceSelectionData);
729
}
730
731
public void TestBug4152117() {
732
Vector<String> sentenceSelectionData = new Vector<String>();
733
734
// Make sure sentence breaking is handling punctuation correctly
735
// [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
736
// IT DOESN'T CROP UP]
737
sentenceSelectionData.addElement("Constructs a randomly generated "
738
+ "BigInteger, uniformly distributed over the range <tt>0</tt> "
739
+ "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. ");
740
sentenceSelectionData.addElement("The uniformity of the distribution "
741
+ "assumes that a fair source of random bits is provided in "
742
+ "<tt>rnd</tt>. ");
743
sentenceSelectionData.addElement("Note that this constructor always "
744
+ "constructs a non-negative BigInteger.\u2029");
745
746
generalIteratorTest(sentenceBreak, sentenceSelectionData);
747
}
748
749
public void TestLineBreak() {
750
Vector<String> lineSelectionData = new Vector<String>();
751
752
lineSelectionData.addElement("Multi-");
753
lineSelectionData.addElement("Level ");
754
lineSelectionData.addElement("example ");
755
lineSelectionData.addElement("of ");
756
lineSelectionData.addElement("a ");
757
lineSelectionData.addElement("semi-");
758
lineSelectionData.addElement("idiotic ");
759
lineSelectionData.addElement("non-");
760
lineSelectionData.addElement("sensical ");
761
lineSelectionData.addElement("(non-");
762
lineSelectionData.addElement("important) ");
763
lineSelectionData.addElement("sentence. ");
764
765
lineSelectionData.addElement("Hi ");
766
lineSelectionData.addElement("Hello ");
767
lineSelectionData.addElement("How\n");
768
lineSelectionData.addElement("are\r");
769
lineSelectionData.addElement("you\u2028");
770
lineSelectionData.addElement("fine.\t");
771
lineSelectionData.addElement("good. ");
772
773
lineSelectionData.addElement("Now\r");
774
lineSelectionData.addElement("is\n");
775
lineSelectionData.addElement("the\r\n");
776
lineSelectionData.addElement("time\n");
777
lineSelectionData.addElement("\r");
778
lineSelectionData.addElement("for\r");
779
lineSelectionData.addElement("\r");
780
lineSelectionData.addElement("all");
781
782
generalIteratorTest(lineBreak, lineSelectionData);
783
}
784
785
public void TestBug4068133() {
786
Vector<String> lineSelectionData = new Vector<String>();
787
788
lineSelectionData.addElement("\u96f6");
789
lineSelectionData.addElement("\u4e00\u3002");
790
lineSelectionData.addElement("\u4e8c\u3001");
791
lineSelectionData.addElement("\u4e09\u3002\u3001");
792
lineSelectionData.addElement("\u56db\u3001\u3002\u3001");
793
lineSelectionData.addElement("\u4e94,");
794
lineSelectionData.addElement("\u516d.");
795
lineSelectionData.addElement("\u4e03.\u3001,\u3002");
796
lineSelectionData.addElement("\u516b");
797
798
generalIteratorTest(lineBreak, lineSelectionData);
799
}
800
801
public void TestBug4086052() {
802
Vector<String> lineSelectionData = new Vector<String>();
803
804
lineSelectionData.addElement("foo\u00a0bar ");
805
// lineSelectionData.addElement("foo\ufeffbar");
806
807
generalIteratorTest(lineBreak, lineSelectionData);
808
}
809
810
public void TestBug4097920() {
811
Vector<String> lineSelectionData = new Vector<String>();
812
813
lineSelectionData.addElement("dog,");
814
lineSelectionData.addElement("cat,");
815
lineSelectionData.addElement("mouse ");
816
lineSelectionData.addElement("(one)");
817
lineSelectionData.addElement("(two)\n");
818
819
generalIteratorTest(lineBreak, lineSelectionData);
820
}
821
/*
822
public void TestBug4035266() {
823
Vector<String> lineSelectionData = new Vector<String>();
824
825
lineSelectionData.addElement("The ");
826
lineSelectionData.addElement("balance ");
827
lineSelectionData.addElement("is ");
828
lineSelectionData.addElement("$-23,456.78, ");
829
lineSelectionData.addElement("not ");
830
lineSelectionData.addElement("-$32,456.78!\n");
831
832
generalIteratorTest(lineBreak, lineSelectionData);
833
}
834
*/
835
public void TestBug4098467Lines() {
836
Vector<String> lineSelectionData = new Vector<String>();
837
838
// What follows is a string of Korean characters (I found it in the Yellow Pages
839
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
840
// it correctly), first as precomposed syllables, and then as conjoining jamo.
841
// Both sequences should be semantically identical and break the same way.
842
// precomposed syllables...
843
lineSelectionData.addElement("\uc0c1");
844
lineSelectionData.addElement("\ud56d ");
845
lineSelectionData.addElement("\ud55c");
846
lineSelectionData.addElement("\uc778 ");
847
lineSelectionData.addElement("\uc5f0");
848
lineSelectionData.addElement("\ud569 ");
849
lineSelectionData.addElement("\uc7a5");
850
lineSelectionData.addElement("\ub85c");
851
lineSelectionData.addElement("\uad50");
852
lineSelectionData.addElement("\ud68c ");
853
// conjoining jamo...
854
lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
855
lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
856
lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
857
lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
858
859
if (Locale.getDefault().getLanguage().equals("th")) {
860
logln("This test is skipped in th locale.");
861
return;
862
}
863
864
generalIteratorTest(lineBreak, lineSelectionData);
865
}
866
867
public void TestBug4117554Lines() {
868
Vector<String> lineSelectionData = new Vector<String>();
869
870
// Fullwidth .!? should be treated as postJwrd
871
lineSelectionData.addElement("\u4e01\uff0e");
872
lineSelectionData.addElement("\u4e02\uff01");
873
lineSelectionData.addElement("\u4e03\uff1f");
874
875
generalIteratorTest(lineBreak, lineSelectionData);
876
}
877
878
public void TestBug4217703() {
879
if (Locale.getDefault().getLanguage().equals("th")) {
880
logln("This test is skipped in th locale.");
881
return;
882
}
883
884
Vector<String> lineSelectionData = new Vector<String>();
885
886
// There shouldn't be a line break between sentence-ending punctuation
887
// and a closing quote
888
lineSelectionData.addElement("He ");
889
lineSelectionData.addElement("said ");
890
lineSelectionData.addElement("\"Go!\" ");
891
lineSelectionData.addElement("I ");
892
lineSelectionData.addElement("went. ");
893
894
lineSelectionData.addElement("Hashtable$Enumeration ");
895
lineSelectionData.addElement("getText().");
896
lineSelectionData.addElement("getIndex()");
897
898
generalIteratorTest(lineBreak, lineSelectionData);
899
}
900
901
private static final String graveS = "S\u0300";
902
private static final String acuteBelowI = "i\u0317";
903
private static final String acuteE = "e\u0301";
904
private static final String circumflexA = "a\u0302";
905
private static final String tildeE = "e\u0303";
906
907
public void TestCharacterBreak() {
908
Vector<String> characterSelectionData = new Vector<String>();
909
910
characterSelectionData.addElement(graveS);
911
characterSelectionData.addElement(acuteBelowI);
912
characterSelectionData.addElement("m");
913
characterSelectionData.addElement("p");
914
characterSelectionData.addElement("l");
915
characterSelectionData.addElement(acuteE);
916
characterSelectionData.addElement(" ");
917
characterSelectionData.addElement("s");
918
characterSelectionData.addElement(circumflexA);
919
characterSelectionData.addElement("m");
920
characterSelectionData.addElement("p");
921
characterSelectionData.addElement("l");
922
characterSelectionData.addElement(tildeE);
923
characterSelectionData.addElement(".");
924
characterSelectionData.addElement("w");
925
characterSelectionData.addElement(circumflexA);
926
characterSelectionData.addElement("w");
927
characterSelectionData.addElement("a");
928
characterSelectionData.addElement("f");
929
characterSelectionData.addElement("q");
930
characterSelectionData.addElement("\n");
931
characterSelectionData.addElement("\r");
932
characterSelectionData.addElement("\r\n");
933
characterSelectionData.addElement("\n");
934
935
generalIteratorTest(characterBreak, characterSelectionData);
936
}
937
938
public void TestBug4098467Characters() {
939
Vector<String> characterSelectionData = new Vector<String>();
940
941
// What follows is a string of Korean characters (I found it in the Yellow Pages
942
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
943
// it correctly), first as precomposed syllables, and then as conjoining jamo.
944
// Both sequences should be semantically identical and break the same way.
945
// precomposed syllables...
946
characterSelectionData.addElement("\uc0c1");
947
characterSelectionData.addElement("\ud56d");
948
characterSelectionData.addElement(" ");
949
characterSelectionData.addElement("\ud55c");
950
characterSelectionData.addElement("\uc778");
951
characterSelectionData.addElement(" ");
952
characterSelectionData.addElement("\uc5f0");
953
characterSelectionData.addElement("\ud569");
954
characterSelectionData.addElement(" ");
955
characterSelectionData.addElement("\uc7a5");
956
characterSelectionData.addElement("\ub85c");
957
characterSelectionData.addElement("\uad50");
958
characterSelectionData.addElement("\ud68c");
959
characterSelectionData.addElement(" ");
960
// conjoining jamo...
961
characterSelectionData.addElement("\u1109\u1161\u11bc");
962
characterSelectionData.addElement("\u1112\u1161\u11bc");
963
characterSelectionData.addElement(" ");
964
characterSelectionData.addElement("\u1112\u1161\u11ab");
965
characterSelectionData.addElement("\u110b\u1175\u11ab");
966
characterSelectionData.addElement(" ");
967
characterSelectionData.addElement("\u110b\u1167\u11ab");
968
characterSelectionData.addElement("\u1112\u1161\u11b8");
969
characterSelectionData.addElement(" ");
970
characterSelectionData.addElement("\u110c\u1161\u11bc");
971
characterSelectionData.addElement("\u1105\u1169");
972
characterSelectionData.addElement("\u1100\u116d");
973
characterSelectionData.addElement("\u1112\u116c");
974
975
generalIteratorTest(characterBreak, characterSelectionData);
976
}
977
978
public void TestBug4153072() {
979
BreakIterator iter = BreakIterator.getWordInstance();
980
String str = "...Hello, World!...";
981
int begin = 3;
982
int end = str.length() - 3;
983
boolean gotException = false;
984
boolean dummy;
985
986
iter.setText(new StringCharacterIterator(str, begin, end, begin));
987
for (int index = -1; index < begin + 1; ++index) {
988
try {
989
dummy = iter.isBoundary(index);
990
if (index < begin)
991
errln("Didn't get exception with offset = " + index +
992
" and begin index = " + begin);
993
}
994
catch (IllegalArgumentException e) {
995
if (index >= begin)
996
errln("Got exception with offset = " + index +
997
" and begin index = " + begin);
998
}
999
}
1000
}
1001
1002
public void TestBug4146175Sentences() {
1003
Vector<String> sentenceSelectionData = new Vector<String>();
1004
1005
// break between periods and opening punctuation even when there's no
1006
// intervening space
1007
sentenceSelectionData.addElement("end.");
1008
sentenceSelectionData.addElement("(This is\u2029");
1009
1010
// treat the fullwidth period as an unambiguous sentence terminator
1011
sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
1012
sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
1013
1014
generalIteratorTest(sentenceBreak, sentenceSelectionData);
1015
}
1016
1017
public void TestBug4146175Lines() {
1018
if (Locale.getDefault().getLanguage().equals("th")) {
1019
logln("This test is skipped in th locale.");
1020
return;
1021
}
1022
1023
Vector<String> lineSelectionData = new Vector<String>();
1024
1025
// the fullwidth comma should stick to the preceding Japanese character
1026
lineSelectionData.addElement("\u7d42\uff0c");
1027
lineSelectionData.addElement("\u308f");
1028
1029
generalIteratorTest(lineBreak, lineSelectionData);
1030
}
1031
1032
public void TestBug4214367() {
1033
if (Locale.getDefault().getLanguage().equals("th")) {
1034
logln("This test is skipped in th locale.");
1035
return;
1036
}
1037
1038
Vector<String> wordSelectionData = new Vector<String>();
1039
1040
// the hiragana and katakana iteration marks and the long vowel mark
1041
// are not being treated correctly by the word-break iterator
1042
wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
1043
wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
1044
1045
generalIteratorTest(wordBreak, wordSelectionData);
1046
}
1047
1048
private static final String cannedTestChars // characters fo the class Cc are ignorable for breaking
1049
= /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
1050
+ "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
1051
+ "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
1052
+ "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
1053
+ "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
1054
+ "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
1055
1056
public void TestSentenceInvariants()
1057
{
1058
BreakIterator e = BreakIterator.getSentenceInstance();
1059
doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
1060
}
1061
1062
public void TestWordInvariants()
1063
{
1064
if (Locale.getDefault().getLanguage().equals("th")) {
1065
logln("This test is skipped in th locale.");
1066
return;
1067
}
1068
1069
BreakIterator e = BreakIterator.getWordInstance();
1070
doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1071
+ "\u30a3\u4e00\u4e01\u4e02");
1072
doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1073
+ "\u30a3\u4e00\u4e01\u4e02");
1074
}
1075
1076
public void TestLineInvariants()
1077
{
1078
if (Locale.getDefault().getLanguage().equals("th")) {
1079
logln("This test is skipped in th locale.");
1080
return;
1081
}
1082
1083
BreakIterator e = BreakIterator.getLineInstance();
1084
String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
1085
+ "\u30a3\u4e00\u4e01\u4e02";
1086
doBreakInvariantTest(e, testChars);
1087
doOtherInvariantTest(e, testChars);
1088
1089
int errorCount = 0;
1090
1091
// in addition to the other invariants, a line-break iterator should make sure that:
1092
// it doesn't break around the non-breaking characters
1093
String noBreak = "\u00a0\u2007\u2011\ufeff";
1094
StringBuffer work = new StringBuffer("aaa");
1095
for (int i = 0; i < testChars.length(); i++) {
1096
char c = testChars.charAt(i);
1097
if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
1098
continue;
1099
work.setCharAt(0, c);
1100
for (int j = 0; j < noBreak.length(); j++) {
1101
work.setCharAt(1, noBreak.charAt(j));
1102
for (int k = 0; k < testChars.length(); k++) {
1103
work.setCharAt(2, testChars.charAt(k));
1104
// CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
1105
// for breaking purposes as per UTR14
1106
int type1 = Character.getType(work.charAt(1));
1107
int type2 = Character.getType(work.charAt(2));
1108
if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
1109
type2 == Character.CONTROL || type2 == Character.FORMAT) {
1110
continue;
1111
}
1112
e.setText(work.toString());
1113
for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) {
1114
if (l == 1 || l == 2) {
1115
//errln("Got break between U+" + Integer.toHexString((int)
1116
// (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1117
// (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2);
1118
// as per UTR14 spaces followed by a GLUE character should allow
1119
// line breaking
1120
if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' ||
1121
work.charAt(l) == '\u0f0c' ||
1122
work.charAt(l) == '\u2007' ||
1123
work.charAt(l) == '\u2011' ||
1124
work.charAt(l) == '\u202f' ||
1125
work.charAt(l) == '\ufeff')) {
1126
continue;
1127
}
1128
errln("Got break between U+" + Integer.toHexString((int)
1129
(work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1130
(int)(work.charAt(l))));
1131
errorCount++;
1132
if (errorCount >= 75)
1133
return;
1134
}
1135
}
1136
}
1137
}
1138
}
1139
1140
// The following test has so many exceptions that it would be better to write a new set of data
1141
// that tested exactly what should be tested
1142
// Until that point it will be commented out
1143
/*
1144
1145
// it does break after dashes (unless they're followed by a digit, a non-spacing mark,
1146
// a currency symbol, a space, a format-control character, a regular control character,
1147
// a line or paragraph separator, or another dash)
1148
String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
1149
for (int i = 0; i < testChars.length(); i++) {
1150
work.setCharAt(0, testChars.charAt(i));
1151
for (int j = 0; j < dashes.length(); j++) {
1152
work.setCharAt(1, dashes.charAt(j));
1153
for (int k = 0; k < testChars.length(); k++) {
1154
char c = testChars.charAt(k);
1155
if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
1156
Character.getType(c) == Character.OTHER_NUMBER ||
1157
Character.getType(c) == Character.NON_SPACING_MARK ||
1158
Character.getType(c) == Character.ENCLOSING_MARK ||
1159
Character.getType(c) == Character.CURRENCY_SYMBOL ||
1160
Character.getType(c) == Character.DASH_PUNCTUATION ||
1161
Character.getType(c) == Character.SPACE_SEPARATOR ||
1162
Character.getType(c) == Character.FORMAT ||
1163
Character.getType(c) == Character.CONTROL ||
1164
Character.getType(c) == Character.END_PUNCTUATION ||
1165
Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
1166
Character.getType(c) == Character.OTHER_PUNCTUATION ||
1167
c == '\'' || c == '\"' ||
1168
// category EX as per UTR14
1169
c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
1170
c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
1171
c == '\u0003' || c == '\u2007' || c == '\u2011' ||
1172
c == '\ufeff')
1173
continue;
1174
work.setCharAt(2, c);
1175
e.setText(work.toString());
1176
boolean saw2 = false;
1177
for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
1178
if (l == 2)
1179
saw2 = true;
1180
if (!saw2) {
1181
errln("Didn't get break between U+" + Integer.toHexString((int)
1182
(work.charAt(1))) + " and U+" + Integer.toHexString(
1183
(int)(work.charAt(2))));
1184
errorCount++;
1185
if (errorCount >= 75)
1186
return;
1187
}
1188
}
1189
}
1190
}
1191
*/
1192
}
1193
1194
public void TestCharacterInvariants()
1195
{
1196
BreakIterator e = BreakIterator.getCharacterInstance();
1197
doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1198
+ "\u11a9\u11aa");
1199
doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1200
+ "\u11a9\u11aa");
1201
}
1202
1203
public void TestEmptyString()
1204
{
1205
String text = "";
1206
Vector<String> x = new Vector<String>();
1207
x.addElement(text);
1208
1209
generalIteratorTest(lineBreak, x);
1210
}
1211
1212
public void TestGetAvailableLocales()
1213
{
1214
Locale[] locList = BreakIterator.getAvailableLocales();
1215
1216
if (locList.length == 0)
1217
errln("getAvailableLocales() returned an empty list!");
1218
// I have no idea how to test this function...
1219
}
1220
1221
1222
/**
1223
* Bug 4095322
1224
*/
1225
public void TestJapaneseLineBreak()
1226
{
1227
StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
1228
// Breaking on <Kanji>$<Kanji> is inconsistent
1229
1230
/* Characters in precedingChars and followingChars have been updated
1231
* from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
1232
* In concrete terms,
1233
* 0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
1234
* 0x169B & 0x169C : added since Unicode 3.0.0.
1235
*/
1236
String precedingChars =
1237
/* Puctuation, Open */
1238
"([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
1239
/* Punctuation, Initial quote */
1240
+ "\u00ab\u2018\u201b\u201c\u201f\u2039"
1241
/* Symbol, Currency */
1242
+ "\u00a5\u00a3\u00a4\u20a0";
1243
1244
String followingChars =
1245
/* Puctuation, Close */
1246
")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
1247
/* Punctuation, Final quote */
1248
+ "\u00bb\u2019\u201d\u203a"
1249
/* Punctuation, Other */
1250
+ "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
1251
/* Punctuation, Dash */
1252
+ "\u2103\u2109"
1253
/* Symbol, Currency */
1254
+ "\u00a2"
1255
/* Letter, Modifier */
1256
+ "\u3005\u309d\u309e"
1257
/* Letter, Other */
1258
+ "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
1259
/* Mark, Non-Spacing */
1260
+ "\u0300\u0301\u0302"
1261
/* Symbol, Modifier */
1262
+ "\u309b\u309c"
1263
/* Symbol, Other */
1264
+ "\u00b0";
1265
1266
BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
1267
1268
for (int i = 0; i < precedingChars.length(); i++) {
1269
testString.setCharAt(1, precedingChars.charAt(i));
1270
iter.setText(testString.toString());
1271
int j = iter.first();
1272
if (j != 0) {
1273
errln("ja line break failure: failed to start at 0 and bounced at " + j);
1274
}
1275
j = iter.next();
1276
if (j != 1) {
1277
errln("ja line break failure: failed to stop before '"
1278
+ precedingChars.charAt(i) + "' (\\u"
1279
+ Integer.toString(precedingChars.charAt(i), 16)
1280
+ ") at 1 and bounded at " + j);
1281
}
1282
j = iter.next();
1283
if (j != 3) {
1284
errln("ja line break failure: failed to skip position after '"
1285
+ precedingChars.charAt(i) + "' (\\u"
1286
+ Integer.toString(precedingChars.charAt(i), 16)
1287
+ ") at 3 and bounded at " + j);
1288
}
1289
}
1290
1291
for (int i = 0; i < followingChars.length(); i++) {
1292
testString.setCharAt(1, followingChars.charAt(i));
1293
iter.setText(testString.toString());
1294
int j = iter.first();
1295
if (j != 0) {
1296
errln("ja line break failure: failed to start at 0 and bounded at " + j);
1297
}
1298
j = iter.next();
1299
if (j != 2) {
1300
errln("ja line break failure: failed to skip position before '"
1301
+ followingChars.charAt(i) + "' (\\u"
1302
+ Integer.toString(followingChars.charAt(i), 16)
1303
+ ") at 2 and bounded at " + j);
1304
}
1305
j = iter.next();
1306
if (j != 3) {
1307
errln("ja line break failure: failed to stop after '"
1308
+ followingChars.charAt(i) + "' (\\u"
1309
+ Integer.toString(followingChars.charAt(i), 16)
1310
+ ") at 3 and bounded at " + j);
1311
}
1312
}
1313
}
1314
1315
/**
1316
* Bug 4638433
1317
*/
1318
public void TestLineBreakBasedOnUnicode3_0_0()
1319
{
1320
BreakIterator iter;
1321
int i;
1322
1323
/* Latin Extend-B characters
1324
* 0x0218-0x0233 which have been added since Unicode 3.0.0.
1325
*/
1326
iter = BreakIterator.getWordInstance(Locale.US);
1327
iter.setText("\u0216\u0217\u0218\u0219\u021A");
1328
i = iter.first();
1329
i = iter.next();
1330
if (i != 5) {
1331
errln("Word break failure: failed to stop at 5 and bounded at " + i);
1332
}
1333
1334
1335
iter = BreakIterator.getLineInstance(Locale.US);
1336
1337
/* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
1338
* \u301f has changed its category from Ps to Pe since Unicode 2.1.
1339
*/
1340
iter.setText("32\u301f1");
1341
i = iter.first();
1342
i = iter.next();
1343
if (i != 3) {
1344
errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
1345
}
1346
1347
/* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
1348
* which have been added since Unicode 3.0.0.
1349
*/
1350
iter.setText("\u1820\u1806\u1821");
1351
i = iter.first();
1352
i = iter.next();
1353
if (i != 2) {
1354
errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
1355
}
1356
1357
/* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
1358
* been added since Unicode 3.0.0.
1359
*/
1360
iter.setText("\u17E0\u17DB\u17E1");
1361
i = iter.first();
1362
i = iter.next();
1363
if (i != 1) {
1364
errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
1365
}
1366
i = iter.next();
1367
if (i != 3) {
1368
errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
1369
}
1370
1371
/* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
1372
* been added since Unicode 3.0.0.
1373
*/
1374
iter.setText("\u1692\u1680\u1696");
1375
i = iter.first();
1376
i = iter.next();
1377
if (i != 2) {
1378
errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
1379
}
1380
1381
1382
// Confirm changes in BreakIteratorRules_th.java have been reflected.
1383
iter = BreakIterator.getLineInstance(new Locale("th", ""));
1384
1385
/* Thai <Seven(Nd)>
1386
* <Left Double Quotation Mark(Pi)>
1387
* <Five(Nd)>
1388
* <Right Double Quotation Mark(Pf)>
1389
* <Three(Nd)>
1390
*/
1391
iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
1392
i = iter.first();
1393
i = iter.next();
1394
if (i != 1) {
1395
errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
1396
}
1397
i = iter.next();
1398
if (i != 4) {
1399
errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
1400
}
1401
}
1402
1403
/**
1404
* Bug 4068137
1405
*/
1406
public void TestEndBehavior()
1407
{
1408
String testString = "boo.";
1409
BreakIterator wb = BreakIterator.getWordInstance();
1410
wb.setText(testString);
1411
1412
if (wb.first() != 0)
1413
errln("Didn't get break at beginning of string.");
1414
if (wb.next() != 3)
1415
errln("Didn't get break before period in \"boo.\"");
1416
if (wb.current() != 4 && wb.next() != 4)
1417
errln("Didn't get break at end of string.");
1418
}
1419
1420
// [serialization test has been removed pursuant to bug #4152965]
1421
1422
/**
1423
* Bug 4450804
1424
*/
1425
public void TestLineBreakContractions() {
1426
Vector<String> expected = new Vector<String>();
1427
1428
expected.add("These ");
1429
expected.add("are ");
1430
expected.add("'foobles'. ");
1431
expected.add("Don't ");
1432
expected.add("you ");
1433
expected.add("like ");
1434
expected.add("them?");
1435
generalIteratorTest(lineBreak, expected);
1436
}
1437
1438
}
1439
1440