Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
80684 views
1
module.exports = Tokenizer;

// Entity helpers and lookup tables, loaded from the `entities` package.
var decodeCodePoint = require("entities/lib/decode_codepoint.js"),
    entityMap = require("entities/maps/entities.json"),
    legacyMap = require("entities/maps/legacy.json"),
    xmlMap    = require("entities/maps/xml.json"),

    // Tokenizer states. Every constant takes the next value of the counter `i`,
    // so the values are unique integers that follow declaration order.
    i = 0,

    TEXT                      = i++,
    BEFORE_TAG_NAME           = i++, //after <
    IN_TAG_NAME               = i++,
    IN_SELF_CLOSING_TAG       = i++,
    BEFORE_CLOSING_TAG_NAME   = i++,
    IN_CLOSING_TAG_NAME       = i++,
    AFTER_CLOSING_TAG_NAME    = i++,

    //attributes
    BEFORE_ATTRIBUTE_NAME     = i++,
    IN_ATTRIBUTE_NAME         = i++,
    AFTER_ATTRIBUTE_NAME      = i++,
    BEFORE_ATTRIBUTE_VALUE    = i++,
    IN_ATTRIBUTE_VALUE_DQ     = i++, // "
    IN_ATTRIBUTE_VALUE_SQ     = i++, // '
    IN_ATTRIBUTE_VALUE_NQ     = i++,

    //declarations
    BEFORE_DECLARATION        = i++, // !
    IN_DECLARATION            = i++,

    //processing instructions
    IN_PROCESSING_INSTRUCTION = i++, // ?

    //comments
    BEFORE_COMMENT            = i++,
    IN_COMMENT                = i++,
    AFTER_COMMENT_1           = i++,
    AFTER_COMMENT_2           = i++,

    //cdata
    BEFORE_CDATA_1            = i++, // [
    BEFORE_CDATA_2            = i++, // C
    BEFORE_CDATA_3            = i++, // D
    BEFORE_CDATA_4            = i++, // A
    BEFORE_CDATA_5            = i++, // T
    BEFORE_CDATA_6            = i++, // A
    IN_CDATA                  = i++, // [
    AFTER_CDATA_1             = i++, // ]
    AFTER_CDATA_2             = i++, // ]

    //special tags
    BEFORE_SPECIAL            = i++, //S
    BEFORE_SPECIAL_END        = i++, //S

    BEFORE_SCRIPT_1           = i++, //C
    BEFORE_SCRIPT_2           = i++, //R
    BEFORE_SCRIPT_3           = i++, //I
    BEFORE_SCRIPT_4           = i++, //P
    BEFORE_SCRIPT_5           = i++, //T
    AFTER_SCRIPT_1            = i++, //C
    AFTER_SCRIPT_2            = i++, //R
    AFTER_SCRIPT_3            = i++, //I
    AFTER_SCRIPT_4            = i++, //P
    AFTER_SCRIPT_5            = i++, //T

    BEFORE_STYLE_1            = i++, //T
    BEFORE_STYLE_2            = i++, //Y
    BEFORE_STYLE_3            = i++, //L
    BEFORE_STYLE_4            = i++, //E
    AFTER_STYLE_1             = i++, //T
    AFTER_STYLE_2             = i++, //Y
    AFTER_STYLE_3             = i++, //L
    AFTER_STYLE_4             = i++, //E

    BEFORE_ENTITY             = i++, //&
    BEFORE_NUMERIC_ENTITY     = i++, //#
    IN_NAMED_ENTITY           = i++,
    IN_NUMERIC_ENTITY         = i++,
    IN_HEX_ENTITY             = i++, //X

    // Values stored in `_special`; numbered by their own counter `j`.
    j = 0,

    SPECIAL_NONE              = j++,
    SPECIAL_SCRIPT            = j++,
    SPECIAL_STYLE             = j++;
86
87
// Returns true when `c` is exactly one of: space, line feed, tab,
// form feed or carriage return; false for anything else.
function whitespace(c){
	switch(c){
		case " ":
		case "\n":
		case "\t":
		case "\f":
		case "\r":
			return true;
		default:
			return false;
	}
}
90
91
// Builds a state handler that moves `this._state` to SUCCESS when the current
// character equals `char`; any other character leaves the state untouched.
function characterState(char, SUCCESS){
	return function(c){
		if(c !== char) return;
		this._state = SUCCESS;
	};
}
96
97
// Builds a state handler that accepts `upper` in either letter case.
// On a match the state becomes SUCCESS; otherwise it becomes FAILURE and
// `_index` is decremented so the same character is processed again.
function ifElseState(upper, SUCCESS, FAILURE){
	var lower = upper.toLowerCase();

	// When `upper` has no distinct lower-case form both comparisons test the
	// same value, so one closure covers both situations.
	return function(c){
		if(c === lower || c === upper){
			this._state = SUCCESS;
			return;
		}

		this._state = FAILURE;
		this._index--;
	};
}
120
121
// Builds a state handler for one letter of a special tag name.
// A match (either letter case) advances to NEXT_STATE; anything else falls
// back to IN_TAG_NAME and re-processes the character by decrementing `_index`.
function consumeSpecialNameChar(upper, NEXT_STATE){
	var lower = upper.toLowerCase();

	return function(c){
		var matched = (c === lower || c === upper);

		this._state = matched ? NEXT_STATE : IN_TAG_NAME;
		if(!matched){
			this._index--; //consume the token again
		}
	};
}
133
134
// Creates a tokenizer in its initial TEXT state.
//   options - optional; its `xmlMode` and `decodeEntities` properties are read
//             and coerced to booleans (both false when `options` is falsy)
//   cbs     - callback object stored as `_cbs` and invoked by the state handlers
function Tokenizer(options, cbs){
	var opts = options || {};

	this._state = TEXT;
	this._buffer = "";
	this._sectionStart = 0;
	this._index = 0;
	this._bufferOffset = 0; //chars removed from _buffer
	this._baseState = TEXT;
	this._special = SPECIAL_NONE;
	this._cbs = cbs;
	this._running = true;
	this._ended = false;
	this._xmlMode = Boolean(opts.xmlMode);
	this._decodeEntities = Boolean(opts.decodeEntities);
}
148
149
// TEXT state: "<" starts a tag; "&" starts an entity when entity decoding is
// enabled and no special section is active. In both cases any text collected
// so far is reported through `ontext` first.
Tokenizer.prototype._stateText = function(c){
	var startsTag = c === "<";
	var startsEntity = !startsTag &&
		this._decodeEntities && this._special === SPECIAL_NONE && c === "&";

	if(!startsTag && !startsEntity) return;

	if(this._index > this._sectionStart){
		this._cbs.ontext(this._getSection());
	}

	if(startsTag){
		this._state = BEFORE_TAG_NAME;
	} else {
		// remember where to return once the entity has been handled
		this._baseState = TEXT;
		this._state = BEFORE_ENTITY;
	}
	this._sectionStart = this._index;
};
165
166
// Handles the character that follows "<" and picks the matching follow-up state.
Tokenizer.prototype._stateBeforeTagName = function(c){
	if(c === "/"){
		this._state = BEFORE_CLOSING_TAG_NAME;
		return;
	}

	// ">" or whitespace directly after "<", or any character while a special
	// section is active: go back to TEXT.
	if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)){
		this._state = TEXT;
		return;
	}

	switch(c){
		case "!":
			this._state = BEFORE_DECLARATION;
			this._sectionStart = this._index + 1;
			break;
		case "?":
			this._state = IN_PROCESSING_INSTRUCTION;
			this._sectionStart = this._index + 1;
			break;
		case "<":
			// a second "<": report the section so far as text and restart here
			this._cbs.ontext(this._getSection());
			this._sectionStart = this._index;
			break;
		default:
			// outside XML mode, names starting with "s" get the special-tag check
			if(!this._xmlMode && (c === "s" || c === "S")){
				this._state = BEFORE_SPECIAL;
			} else {
				this._state = IN_TAG_NAME;
			}
			this._sectionStart = this._index;
	}
};
186
187
// Inside an opening tag name: "/", ">" or whitespace ends the name. The name is
// emitted via `onopentagname` and the terminating character is processed again
// in BEFORE_ATTRIBUTE_NAME.
Tokenizer.prototype._stateInTagName = function(c){
	var endsName = c === "/" || c === ">" || whitespace(c);
	if(!endsName) return;

	this._emitToken("onopentagname");
	this._state = BEFORE_ATTRIBUTE_NAME;
	this._index--;
};
194
195
// Handles the characters after "</" until the closing tag name begins.
Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
	// whitespace is skipped
	if(whitespace(c)) return;

	if(c === ">"){
		this._state = TEXT;
		return;
	}

	if(this._special === SPECIAL_NONE){
		this._state = IN_CLOSING_TAG_NAME;
		this._sectionStart = this._index;
		return;
	}

	// Inside a special section only a name starting with "s" is examined further.
	if(c === "s" || c === "S"){
		this._state = BEFORE_SPECIAL_END;
	} else {
		this._state = TEXT;
		this._index--;
	}
};
211
212
// Inside a closing tag name: ">" or whitespace ends the name, which is emitted
// via `onclosetag`; the terminating character is processed again in
// AFTER_CLOSING_TAG_NAME.
Tokenizer.prototype._stateInCloseingTagName = function(c){
	if(c !== ">" && !whitespace(c)) return;

	this._emitToken("onclosetag");
	this._state = AFTER_CLOSING_TAG_NAME;
	this._index--;
};
219
220
// After a closing tag name: every character is ignored until ">", after which
// a new text section starts at the next character.
Tokenizer.prototype._stateAfterCloseingTagName = function(c){
	if(c !== ">") return;

	this._state = TEXT;
	this._sectionStart = this._index + 1;
};
227
228
// Between attributes of an opening tag: ">" ends the tag, "/" may start a
// self-closing tag, whitespace is skipped, anything else starts an attribute name.
Tokenizer.prototype._stateBeforeAttributeName = function(c){
	switch(c){
		case ">":
			this._cbs.onopentagend();
			this._state = TEXT;
			this._sectionStart = this._index + 1;
			break;
		case "/":
			this._state = IN_SELF_CLOSING_TAG;
			break;
		default:
			if(!whitespace(c)){
				this._state = IN_ATTRIBUTE_NAME;
				this._sectionStart = this._index;
			}
	}
};
240
241
// After "/" inside an opening tag: ">" completes a self-closing tag, whitespace
// is skipped, and any other character is handed back to BEFORE_ATTRIBUTE_NAME.
Tokenizer.prototype._stateInSelfClosingTag = function(c){
	if(c === ">"){
		this._cbs.onselfclosingtag();
		this._state = TEXT;
		this._sectionStart = this._index + 1;
		return;
	}

	if(whitespace(c)) return;

	this._state = BEFORE_ATTRIBUTE_NAME;
	this._index--;
};
251
252
// Inside an attribute name: "=", "/", ">" or whitespace ends it. The name is
// reported via `onattribname` and the terminator is processed again in
// AFTER_ATTRIBUTE_NAME.
Tokenizer.prototype._stateInAttributeName = function(c){
	var endsName = c === "=" || c === "/" || c === ">" || whitespace(c);
	if(!endsName) return;

	this._cbs.onattribname(this._getSection());
	this._sectionStart = -1;
	this._state = AFTER_ATTRIBUTE_NAME;
	this._index--;
};
260
261
// After an attribute name: "=" introduces a value; "/" or ">" or the start of
// another name ends the attribute without a value; whitespace is skipped.
Tokenizer.prototype._stateAfterAttributeName = function(c){
	if(c === "="){
		this._state = BEFORE_ATTRIBUTE_VALUE;
		return;
	}

	var endsTag = c === "/" || c === ">";
	if(!endsTag && whitespace(c)) return;

	this._cbs.onattribend();

	if(endsTag){
		// let BEFORE_ATTRIBUTE_NAME handle the "/" or ">"
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--;
	} else {
		// the current character is the first one of the next attribute name
		this._state = IN_ATTRIBUTE_NAME;
		this._sectionStart = this._index;
	}
};
274
275
// After "=": selects the quoted or unquoted value state; whitespace is skipped.
Tokenizer.prototype._stateBeforeAttributeValue = function(c){
	switch(c){
		case "\"":
			this._state = IN_ATTRIBUTE_VALUE_DQ;
			this._sectionStart = this._index + 1;
			break;
		case "'":
			this._state = IN_ATTRIBUTE_VALUE_SQ;
			this._sectionStart = this._index + 1;
			break;
		default:
			if(whitespace(c)) break;
			this._state = IN_ATTRIBUTE_VALUE_NQ;
			this._sectionStart = this._index;
			this._index--; //reconsume token
	}
};
288
289
// Inside a double-quoted attribute value: the closing quote emits the data and
// ends the attribute; "&" starts an entity when entity decoding is enabled.
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
	if(c === "\""){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		return;
	}

	if(!this._decodeEntities || c !== "&") return;

	this._emitToken("onattribdata");
	// remember this state so the entity states can return to it
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};
301
302
// Inside a single-quoted attribute value: the closing quote emits the data and
// ends the attribute; "&" starts an entity when entity decoding is enabled.
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
	if(c === "'"){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		return;
	}

	if(!this._decodeEntities || c !== "&") return;

	this._emitToken("onattribdata");
	// remember this state so the entity states can return to it
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};
314
315
// Inside an unquoted attribute value: whitespace or ">" ends the value (the
// terminator is processed again); "&" starts an entity when decoding is enabled.
Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
	if(whitespace(c) || c === ">"){
		this._emitToken("onattribdata");
		this._cbs.onattribend();
		this._state = BEFORE_ATTRIBUTE_NAME;
		this._index--;
		return;
	}

	if(!this._decodeEntities || c !== "&") return;

	this._emitToken("onattribdata");
	// remember this state so the entity states can return to it
	this._baseState = this._state;
	this._state = BEFORE_ENTITY;
	this._sectionStart = this._index;
};
328
329
// After "<!": "[" may begin a CDATA section, "-" may begin a comment,
// anything else is treated as a declaration.
Tokenizer.prototype._stateBeforeDeclaration = function(c){
	switch(c){
		case "[":
			this._state = BEFORE_CDATA_1;
			break;
		case "-":
			this._state = BEFORE_COMMENT;
			break;
		default:
			this._state = IN_DECLARATION;
	}
};
334
335
// Inside a declaration: ">" ends it and reports the collected section via
// `ondeclaration`; all other characters are accumulated.
Tokenizer.prototype._stateInDeclaration = function(c){
	if(c !== ">") return;

	this._cbs.ondeclaration(this._getSection());
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};
342
343
// Inside a processing instruction: ">" ends it and reports the collected
// section via `onprocessinginstruction`.
Tokenizer.prototype._stateInProcessingInstruction = function(c){
	if(c !== ">") return;

	this._cbs.onprocessinginstruction(this._getSection());
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};
350
351
// After "<!-": a second "-" opens a comment whose content starts at the next
// character; anything else turns the construct into a declaration.
Tokenizer.prototype._stateBeforeComment = function(c){
	if(c !== "-"){
		this._state = IN_DECLARATION;
		return;
	}

	this._state = IN_COMMENT;
	this._sectionStart = this._index + 1;
};
359
360
// Inside a comment: a "-" might be the start of the closing "-->".
Tokenizer.prototype._stateInComment = function(c){
	if(c !== "-") return;
	this._state = AFTER_COMMENT_1;
};
363
364
// After one "-" inside a comment: a second "-" advances towards the
// terminator, any other character returns to the comment body.
Tokenizer.prototype._stateAfterComment1 = function(c){
	this._state = (c === "-") ? AFTER_COMMENT_2 : IN_COMMENT;
};
371
372
// After "--" inside a comment: ">" closes the comment, another "-" keeps
// waiting (`--->`), any other character returns to the comment body.
Tokenizer.prototype._stateAfterComment2 = function(c){
	// stay in AFTER_COMMENT_2
	if(c === "-") return;

	if(c !== ">"){
		this._state = IN_COMMENT;
		return;
	}

	//remove 2 trailing chars
	this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};
383
384
// Handlers for the letters "CDATA" following "<![". Each step accepts one letter
// (either case) and advances; a mismatch falls back to IN_DECLARATION and
// re-processes the character (see ifElseState).
[
	["_stateBeforeCdata1", "C", BEFORE_CDATA_2],
	["_stateBeforeCdata2", "D", BEFORE_CDATA_3],
	["_stateBeforeCdata3", "A", BEFORE_CDATA_4],
	["_stateBeforeCdata4", "T", BEFORE_CDATA_5],
	["_stateBeforeCdata5", "A", BEFORE_CDATA_6]
].forEach(function(step){
	Tokenizer.prototype[step[0]] = ifElseState(step[1], step[2], IN_DECLARATION);
});
389
390
// After "<![CDATA": "[" opens the CDATA section whose content starts at the
// next character; otherwise the input is treated as a declaration and the
// current character is processed again.
Tokenizer.prototype._stateBeforeCdata6 = function(c){
	if(c !== "["){
		this._state = IN_DECLARATION;
		this._index--;
		return;
	}

	this._state = IN_CDATA;
	this._sectionStart = this._index + 1;
};
399
400
// Inside a CDATA section: a "]" might be the start of the closing "]]>".
Tokenizer.prototype._stateInCdata = function(c){
	if(c !== "]") return;
	this._state = AFTER_CDATA_1;
};
403
404
// After a single "]" inside a CDATA section.
// A second "]" advances towards the "]]>" terminator. Any other character means
// the "]" was ordinary content, so the tokenizer returns to IN_CDATA; staying
// in this state would let a later lone "]" count as the second bracket and end
// the section early on input such as `a]b]>`.
Tokenizer.prototype._stateAfterCdata1 = function(c){
	if(c === "]"){
		this._state = AFTER_CDATA_2;
	} else {
		this._state = IN_CDATA;
	}
};
405
406
// After "]]" inside a CDATA section: ">" closes the section, another "]" keeps
// waiting (`]]]>`), any other character returns to the section body.
Tokenizer.prototype._stateAfterCdata2 = function(c){
	// stay in AFTER_CDATA_2
	if(c === "]") return;

	if(c !== ">"){
		this._state = IN_CDATA;
		return;
	}

	//remove 2 trailing chars
	this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
	this._state = TEXT;
	this._sectionStart = this._index + 1;
};
417
418
// After "<s": "c" continues towards "script", "t" towards "style"; any other
// character makes this an ordinary tag name and is processed again.
Tokenizer.prototype._stateBeforeSpecial = function(c){
	switch(c){
		case "c":
		case "C":
			this._state = BEFORE_SCRIPT_1;
			break;
		case "t":
		case "T":
			this._state = BEFORE_STYLE_1;
			break;
		default:
			this._state = IN_TAG_NAME;
			this._index--; //consume the token again
	}
};
428
429
// After "</s" inside a special section: continue matching the closing name of
// the section that is currently open, otherwise return to TEXT.
Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
	var next = TEXT;

	if(this._special === SPECIAL_SCRIPT){
		if(c === "c" || c === "C") next = AFTER_SCRIPT_1;
	} else if(this._special === SPECIAL_STYLE){
		if(c === "t" || c === "T") next = AFTER_STYLE_1;
	}

	this._state = next;
};
437
438
// Handlers for the letters "RIPT" of an opening "script" tag name. A mismatch
// turns the tag into an ordinary one (see consumeSpecialNameChar).
[
	["_stateBeforeScript1", "R", BEFORE_SCRIPT_2],
	["_stateBeforeScript2", "I", BEFORE_SCRIPT_3],
	["_stateBeforeScript3", "P", BEFORE_SCRIPT_4],
	["_stateBeforeScript4", "T", BEFORE_SCRIPT_5]
].forEach(function(step){
	Tokenizer.prototype[step[0]] = consumeSpecialNameChar(step[1], step[2]);
});
442
443
// After "<script": when the name ends here ("/", ">" or whitespace) the
// tokenizer enters script mode. In every case the character is handed back to
// IN_TAG_NAME for regular processing.
Tokenizer.prototype._stateBeforeScript5 = function(c){
	var nameEnds = c === "/" || c === ">" || whitespace(c);

	if(nameEnds){
		this._special = SPECIAL_SCRIPT;
	}

	this._state = IN_TAG_NAME;
	this._index--; //consume the token again
};
450
451
// Handlers for the letters "RIPT" of a closing "script" tag name. A mismatch
// returns to TEXT and re-processes the character (see ifElseState).
[
	["_stateAfterScript1", "R", AFTER_SCRIPT_2],
	["_stateAfterScript2", "I", AFTER_SCRIPT_3],
	["_stateAfterScript3", "P", AFTER_SCRIPT_4],
	["_stateAfterScript4", "T", AFTER_SCRIPT_5]
].forEach(function(step){
	Tokenizer.prototype[step[0]] = ifElseState(step[1], step[2], TEXT);
});
455
456
// After "</script": ">" or whitespace confirms the closing tag. Script mode is
// left and the section is rewound by six characters so IN_CLOSING_TAG_NAME
// emits the name; any other character returns to TEXT.
Tokenizer.prototype._stateAfterScript5 = function(c){
	if(c !== ">" && !whitespace(c)){
		this._state = TEXT;
		return;
	}

	this._special = SPECIAL_NONE;
	this._state = IN_CLOSING_TAG_NAME;
	this._sectionStart = this._index - 6;
	this._index--; //reconsume the token
};
465
466
// Handlers for the letters "YLE" of an opening "style" tag name. A mismatch
// turns the tag into an ordinary one (see consumeSpecialNameChar).
[
	["_stateBeforeStyle1", "Y", BEFORE_STYLE_2],
	["_stateBeforeStyle2", "L", BEFORE_STYLE_3],
	["_stateBeforeStyle3", "E", BEFORE_STYLE_4]
].forEach(function(step){
	Tokenizer.prototype[step[0]] = consumeSpecialNameChar(step[1], step[2]);
});
469
470
// After "<style": when the name ends here ("/", ">" or whitespace) the
// tokenizer enters style mode. In every case the character is handed back to
// IN_TAG_NAME for regular processing.
Tokenizer.prototype._stateBeforeStyle4 = function(c){
	var nameEnds = c === "/" || c === ">" || whitespace(c);

	if(nameEnds){
		this._special = SPECIAL_STYLE;
	}

	this._state = IN_TAG_NAME;
	this._index--; //consume the token again
};
477
478
// Handlers for the letters "YLE" of a closing "style" tag name. A mismatch
// returns to TEXT and re-processes the character (see ifElseState).
[
	["_stateAfterStyle1", "Y", AFTER_STYLE_2],
	["_stateAfterStyle2", "L", AFTER_STYLE_3],
	["_stateAfterStyle3", "E", AFTER_STYLE_4]
].forEach(function(step){
	Tokenizer.prototype[step[0]] = ifElseState(step[1], step[2], TEXT);
});
481
482
// After "</style": ">" or whitespace confirms the closing tag. Style mode is
// left and the section is rewound by five characters so IN_CLOSING_TAG_NAME
// emits the name; any other character returns to TEXT.
Tokenizer.prototype._stateAfterStyle4 = function(c){
	if(c !== ">" && !whitespace(c)){
		this._state = TEXT;
		return;
	}

	this._special = SPECIAL_NONE;
	this._state = IN_CLOSING_TAG_NAME;
	this._sectionStart = this._index - 5;
	this._index--; //reconsume the token
};
491
492
// After "&": "#" announces a numeric entity; anything else is re-processed as
// the first character of a named entity.
Tokenizer.prototype._stateBeforeEntity = function(c){
	if(c === "#"){
		this._state = BEFORE_NUMERIC_ENTITY;
	} else {
		this._state = IN_NAMED_ENTITY;
		this._index--;
	}
};

// After "&#": "x" or "X" selects hexadecimal notation; anything else is
// re-processed as the first character of a decimal entity.
Tokenizer.prototype._stateBeforeNumericEntity = function(c){
	if(c === "x" || c === "X"){
		this._state = IN_HEX_ENTITY;
	} else {
		this._state = IN_NUMERIC_ENTITY;
		this._index--;
	}
};
494
495
//for entities terminated with a semicolon
496
// Looks up the text between the "&" at `_sectionStart` and `_index` as a
// complete entity name (XML map in XML mode, full entity map otherwise).
// On a hit the decoded value is emitted and the section restarts after `_index`.
Tokenizer.prototype._parseNamedEntityStrict = function(){
	//offset = 1
	var nameStart = this._sectionStart + 1;
	if(nameStart >= this._index) return;

	var name = this._buffer.substring(nameStart, this._index);
	var table = this._xmlMode ? xmlMap : entityMap;

	if(!table.hasOwnProperty(name)) return;

	this._emitPartial(table[name]);
	this._sectionStart = this._index + 1;
};
508
509
510
//parses legacy entities (without trailing semicolon)
511
// Tries to match the longest legacy entity name (2 to 6 characters) that starts
// right after the "&" at `_sectionStart`. On a hit the decoded value is emitted
// and `_sectionStart` moves past the "&" and the matched name.
Tokenizer.prototype._parseLegacyEntity = function(){
	var start = this._sectionStart + 1;
	//the max length of legacy entities is 6
	var longest = Math.min(this._index - start, 6);

	//the min length of legacy entities is 2
	for(var length = longest; length >= 2; length--){
		var candidate = this._buffer.substr(start, length);

		if(legacyMap.hasOwnProperty(candidate)){
			this._emitPartial(legacyMap[candidate]);
			this._sectionStart += length + 1;
			return;
		}
	}
};
529
530
// Inside a named entity. ";" terminates it; any character that is not an ASCII
// letter or digit also ends it and is then processed again by the base state.
Tokenizer.prototype._stateInNamedEntity = function(c){
	if(c === ";"){
		this._parseNamedEntityStrict();
		// the strict lookup moves `_sectionStart` on success, so a remaining
		// section means it failed; outside XML mode try a legacy prefix
		if(this._sectionStart + 1 < this._index && !this._xmlMode){
			this._parseLegacyEntity();
		}
		this._state = this._baseState;
		return;
	}

	var endsName = (c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9");
	if(!endsName) return;

	// Nothing is decoded in XML mode or when no name character followed the "&".
	if(!this._xmlMode && this._sectionStart + 1 !== this._index){
		if(this._baseState === TEXT){
			this._parseLegacyEntity();
		} else if(c !== "="){
			// in attribute values only whole names are decoded, and not before "="
			this._parseNamedEntityStrict();
		}
	}

	this._state = this._baseState;
	this._index--;
};
552
553
// Decodes the digits of a numeric entity and returns to the base state.
//   offset - number of prefix characters before the digits (callers pass 2 or 3)
//   base   - radix passed to parseInt (callers pass 10 or 16)
// Without any digits nothing is emitted and `_sectionStart` is moved back by one.
Tokenizer.prototype._decodeNumericEntity = function(offset, base){
	var digitsStart = this._sectionStart + offset;

	if(digitsStart === this._index){
		this._sectionStart--;
	} else {
		//parse entity
		var digits = this._buffer.substring(digitsStart, this._index);
		var codePoint = parseInt(digits, base);

		this._emitPartial(decodeCodePoint(codePoint));
		this._sectionStart = this._index;
	}

	this._state = this._baseState;
};
569
570
// Inside a decimal entity: ";" ends and decodes it; a non-digit also ends it
// (decoded only outside XML mode) and is then processed again.
Tokenizer.prototype._stateInNumericEntity = function(c){
	if(c === ";"){
		this._decodeNumericEntity(2, 10);
		// skip the ";" itself
		this._sectionStart++;
		return;
	}

	var endsDigits = c < "0" || c > "9";
	if(!endsDigits) return;

	if(this._xmlMode){
		this._state = this._baseState;
	} else {
		this._decodeNumericEntity(2, 10);
	}
	this._index--;
};
583
584
// Inside a hexadecimal entity: ";" ends and decodes it; a non-hex character
// also ends it (decoded only outside XML mode) and is then processed again.
Tokenizer.prototype._stateInHexEntity = function(c){
	if(c === ";"){
		this._decodeNumericEntity(3, 16);
		// skip the ";" itself
		this._sectionStart++;
		return;
	}

	var endsDigits = (c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9");
	if(!endsDigits) return;

	if(this._xmlMode){
		this._state = this._baseState;
	} else {
		this._decodeNumericEntity(3, 16);
	}
	this._index--;
};
597
598
// Drops the part of `_buffer` that is no longer needed once the parse loop stops.
//
// `_bufferOffset` counts the characters removed from `_buffer`, so it must be
// increased by the number of characters dropped. When the whole buffer is
// discarded that number is `_index`, which therefore has to be added BEFORE
// `_index` is reset to 0. Adding it afterwards always adds 0 and leaves
// getAbsoluteIndex() stuck at the buffer-relative position.
Tokenizer.prototype._cleanup = function (){
	if(this._sectionStart < 0){
		// no open section: nothing in the buffer is needed any more
		this._buffer = "";
		this._bufferOffset += this._index;
		this._index = 0;
	} else if(this._running){
		if(this._state === TEXT){
			// flush pending text, then discard everything
			if(this._sectionStart !== this._index){
				this._cbs.ontext(this._buffer.substr(this._sectionStart));
			}
			this._buffer = "";
			this._bufferOffset += this._index;
			this._index = 0;
		} else if(this._sectionStart === this._index){
			//the section just started
			this._buffer = "";
			this._bufferOffset += this._index;
			this._index = 0;
		} else {
			//remove everything unnecessary
			this._buffer = this._buffer.substr(this._sectionStart);
			this._index -= this._sectionStart;
			this._bufferOffset += this._sectionStart;
		}

		this._sectionStart = 0;
	}
};
626
627
//TODO make events conditional
628
// Appends `chunk` to the buffer and runs the parse loop. Writing after end()
// is reported through `onerror`; the chunk is still appended and parsed.
Tokenizer.prototype.write = function(chunk){
	if(this._ended){
		this._cbs.onerror(new Error(".write() after done!"));
	}

	this._buffer = this._buffer + chunk;
	this._parse();
};
634
635
// Main loop: feeds the buffer one character at a time to the handler of the
// current state while the tokenizer is running, then trims the buffer.
// Handlers may decrement `_index` to have a character processed again.
Tokenizer.prototype._parse = function(){
	while(this._index < this._buffer.length && this._running){
		var c = this._buffer.charAt(this._index);

		switch(this._state){
			case TEXT: this._stateText(c); break;
			case BEFORE_TAG_NAME: this._stateBeforeTagName(c); break;
			case IN_TAG_NAME: this._stateInTagName(c); break;
			case BEFORE_CLOSING_TAG_NAME: this._stateBeforeCloseingTagName(c); break;
			case IN_CLOSING_TAG_NAME: this._stateInCloseingTagName(c); break;
			case AFTER_CLOSING_TAG_NAME: this._stateAfterCloseingTagName(c); break;
			case IN_SELF_CLOSING_TAG: this._stateInSelfClosingTag(c); break;

			/*
			*	attributes
			*/
			case BEFORE_ATTRIBUTE_NAME: this._stateBeforeAttributeName(c); break;
			case IN_ATTRIBUTE_NAME: this._stateInAttributeName(c); break;
			case AFTER_ATTRIBUTE_NAME: this._stateAfterAttributeName(c); break;
			case BEFORE_ATTRIBUTE_VALUE: this._stateBeforeAttributeValue(c); break;
			case IN_ATTRIBUTE_VALUE_DQ: this._stateInAttributeValueDoubleQuotes(c); break;
			case IN_ATTRIBUTE_VALUE_SQ: this._stateInAttributeValueSingleQuotes(c); break;
			case IN_ATTRIBUTE_VALUE_NQ: this._stateInAttributeValueNoQuotes(c); break;

			/*
			*	declarations
			*/
			case BEFORE_DECLARATION: this._stateBeforeDeclaration(c); break;
			case IN_DECLARATION: this._stateInDeclaration(c); break;

			/*
			*	processing instructions
			*/
			case IN_PROCESSING_INSTRUCTION: this._stateInProcessingInstruction(c); break;

			/*
			*	comments
			*/
			case BEFORE_COMMENT: this._stateBeforeComment(c); break;
			case IN_COMMENT: this._stateInComment(c); break;
			case AFTER_COMMENT_1: this._stateAfterComment1(c); break;
			case AFTER_COMMENT_2: this._stateAfterComment2(c); break;

			/*
			*	cdata
			*/
			case BEFORE_CDATA_1: this._stateBeforeCdata1(c); break;
			case BEFORE_CDATA_2: this._stateBeforeCdata2(c); break;
			case BEFORE_CDATA_3: this._stateBeforeCdata3(c); break;
			case BEFORE_CDATA_4: this._stateBeforeCdata4(c); break;
			case BEFORE_CDATA_5: this._stateBeforeCdata5(c); break;
			case BEFORE_CDATA_6: this._stateBeforeCdata6(c); break;
			case IN_CDATA: this._stateInCdata(c); break;
			case AFTER_CDATA_1: this._stateAfterCdata1(c); break;
			case AFTER_CDATA_2: this._stateAfterCdata2(c); break;

			/*
			*	special tags
			*/
			case BEFORE_SPECIAL: this._stateBeforeSpecial(c); break;
			case BEFORE_SPECIAL_END: this._stateBeforeSpecialEnd(c); break;

			/*
			*	script
			*/
			case BEFORE_SCRIPT_1: this._stateBeforeScript1(c); break;
			case BEFORE_SCRIPT_2: this._stateBeforeScript2(c); break;
			case BEFORE_SCRIPT_3: this._stateBeforeScript3(c); break;
			case BEFORE_SCRIPT_4: this._stateBeforeScript4(c); break;
			case BEFORE_SCRIPT_5: this._stateBeforeScript5(c); break;

			case AFTER_SCRIPT_1: this._stateAfterScript1(c); break;
			case AFTER_SCRIPT_2: this._stateAfterScript2(c); break;
			case AFTER_SCRIPT_3: this._stateAfterScript3(c); break;
			case AFTER_SCRIPT_4: this._stateAfterScript4(c); break;
			case AFTER_SCRIPT_5: this._stateAfterScript5(c); break;

			/*
			*	style
			*/
			case BEFORE_STYLE_1: this._stateBeforeStyle1(c); break;
			case BEFORE_STYLE_2: this._stateBeforeStyle2(c); break;
			case BEFORE_STYLE_3: this._stateBeforeStyle3(c); break;
			case BEFORE_STYLE_4: this._stateBeforeStyle4(c); break;

			case AFTER_STYLE_1: this._stateAfterStyle1(c); break;
			case AFTER_STYLE_2: this._stateAfterStyle2(c); break;
			case AFTER_STYLE_3: this._stateAfterStyle3(c); break;
			case AFTER_STYLE_4: this._stateAfterStyle4(c); break;

			/*
			*	entities
			*/
			case BEFORE_ENTITY: this._stateBeforeEntity(c); break;
			case BEFORE_NUMERIC_ENTITY: this._stateBeforeNumericEntity(c); break;
			case IN_NAMED_ENTITY: this._stateInNamedEntity(c); break;
			case IN_NUMERIC_ENTITY: this._stateInNumericEntity(c); break;
			case IN_HEX_ENTITY: this._stateInHexEntity(c); break;

			default:
				this._cbs.onerror(Error("unknown _state"), this._state);
		}

		this._index++;
	}

	this._cleanup();
};
808
809
// Clears the running flag; the parse loop checks it before each character.
Tokenizer.prototype.pause = function(){
	var running = false;
	this._running = running;
};
812
// Sets the running flag again, parses whatever is still buffered and, if end()
// was already called, finishes the stream.
Tokenizer.prototype.resume = function(){
	this._running = true;

	var hasBufferedInput = this._index < this._buffer.length;
	if(hasBufferedInput) this._parse();

	if(this._ended) this._finish();
};
822
823
// Marks the input as complete, optionally writing a final truthy `chunk` first.
// A repeated call is reported through `onerror`. Trailing data is handled
// immediately unless the tokenizer is paused.
Tokenizer.prototype.end = function(chunk){
	if(this._ended){
		this._cbs.onerror(new Error(".end() after done!"));
	}
	if(chunk){
		this.write(chunk);
	}

	this._ended = true;

	if(this._running){
		this._finish();
	}
};
831
832
// Emits whatever is left in the current section, then signals `onend`.
Tokenizer.prototype._finish = function(){
	//if there is remaining data, emit it in a reasonable way
	var hasRemainder = this._sectionStart < this._index;
	if(hasRemainder) this._handleTrailingData();

	this._cbs.onend();
};
840
841
// Reports the unfinished section at end of input according to the state the
// tokenizer stopped in.
Tokenizer.prototype._handleTrailingData = function(){
	var data = this._buffer.substr(this._sectionStart);
	var state = this._state;

	if(state === IN_CDATA || state === AFTER_CDATA_1 || state === AFTER_CDATA_2){
		this._cbs.oncdata(data);
		return;
	}

	if(state === IN_COMMENT || state === AFTER_COMMENT_1 || state === AFTER_COMMENT_2){
		this._cbs.oncomment(data);
		return;
	}

	// Unterminated entities are decoded outside XML mode; whatever remains of the
	// section afterwards is handled again as part of the base state.
	var inEntity = state === IN_NAMED_ENTITY || state === IN_NUMERIC_ENTITY || state === IN_HEX_ENTITY;
	if(inEntity && !this._xmlMode){
		if(state === IN_NAMED_ENTITY){
			this._parseLegacyEntity();
		} else if(state === IN_NUMERIC_ENTITY){
			this._decodeNumericEntity(2, 10);
		} else {
			this._decodeNumericEntity(3, 16);
		}

		if(this._sectionStart < this._index){
			this._state = this._baseState;
			this._handleTrailingData();
		}
		return;
	}

	switch(state){
		case IN_TAG_NAME:
		case BEFORE_ATTRIBUTE_NAME:
		case BEFORE_ATTRIBUTE_VALUE:
		case AFTER_ATTRIBUTE_NAME:
		case IN_ATTRIBUTE_NAME:
		case IN_ATTRIBUTE_VALUE_SQ:
		case IN_ATTRIBUTE_VALUE_DQ:
		case IN_ATTRIBUTE_VALUE_NQ:
		case IN_CLOSING_TAG_NAME:
			//ignore remaining data
			//TODO add a way to remove current tag
			break;
		default:
			this._cbs.ontext(data);
	}
};
882
883
// Re-runs the constructor on this instance, keeping the current mode flags and
// callback object.
Tokenizer.prototype.reset = function(){
	var options = {
		xmlMode: this._xmlMode,
		decodeEntities: this._decodeEntities
	};

	Tokenizer.call(this, options, this._cbs);
};
886
887
// Returns the current position including the characters already removed from
// the buffer (`_bufferOffset`).
Tokenizer.prototype.getAbsoluteIndex = function(){
	var removed = this._bufferOffset;
	return removed + this._index;
};
890
891
// Returns the buffer contents from `_sectionStart` up to (not including) `_index`.
Tokenizer.prototype._getSection = function(){
	var from = this._sectionStart;
	var to = this._index;
	return this._buffer.substring(from, to);
};
894
895
// Passes the current section to the callback called `name`, then marks the
// section as closed by setting `_sectionStart` to -1.
Tokenizer.prototype._emitToken = function(name){
	var callbacks = this._cbs;
	callbacks[name](this._getSection());
	this._sectionStart = -1;
};
899
900
// Emits a decoded entity value: as text when the entity was found in TEXT,
// otherwise as attribute data.
Tokenizer.prototype._emitPartial = function(value){
	if(this._baseState === TEXT){
		this._cbs.ontext(value);
		return;
	}

	this._cbs.onattribdata(value); //TODO implement the new event
};
907
908