Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/xml2/xmlregexp.c
4393 views
1
/*
2
* regexp.c: generic and extensible Regular Expression engine
3
*
4
* Basically designed with the purpose of compiling regexps for
5
* the variety of validation/schemas mechanisms now available in
6
* XML related specifications these include:
7
* - XML-1.0 DTD validation
8
* - XML Schemas structure part 1
9
* - XML Schemas Datatypes part 2 especially Appendix F
10
* - RELAX-NG/TREX i.e. the counter proposal
11
*
12
* See Copyright for the status of this software.
13
*
14
* Daniel Veillard <[email protected]>
15
*/
16
17
#define IN_LIBXML
18
#include "libxml.h"
19
20
#ifdef LIBXML_REGEXP_ENABLED
21
22
#include <stdio.h>
23
#include <string.h>
24
#include <limits.h>
25
26
#include <libxml/tree.h>
27
#include <libxml/parserInternals.h>
28
#include <libxml/xmlregexp.h>
29
#include <libxml/xmlautomata.h>
30
#include <libxml/xmlunicode.h>
31
32
#include "private/error.h"
33
#include "private/regexp.h"
34
35
#ifndef SIZE_MAX
36
#define SIZE_MAX ((size_t) -1)
37
#endif
38
39
#define MAX_PUSH 10000000
40
41
/*
42
* -2 and -3 are used by xmlValidateElementType for other things.
43
*/
44
#define XML_REGEXP_OK 0
45
#define XML_REGEXP_NOT_FOUND (-1)
46
#define XML_REGEXP_INTERNAL_ERROR (-4)
47
#define XML_REGEXP_OUT_OF_MEMORY (-5)
48
#define XML_REGEXP_INTERNAL_LIMIT (-6)
49
#define XML_REGEXP_INVALID_UTF8 (-7)
50
51
#ifdef ERROR
52
#undef ERROR
53
#endif
54
#define ERROR(str) \
55
ctxt->error = XML_REGEXP_COMPILE_ERROR; \
56
xmlRegexpErrCompile(ctxt, str);
57
#define NEXT ctxt->cur++
58
#define CUR (*(ctxt->cur))
59
#define NXT(index) (ctxt->cur[index])
60
61
#define NEXTL(l) ctxt->cur += l;
62
#define XML_REG_STRING_SEPARATOR '|'
63
/*
64
* Need PREV to check on a '-' within a Character Group. May only be used
65
* when it's guaranteed that cur is not at the beginning of ctxt->string!
66
*/
67
#define PREV (ctxt->cur[-1])
68
69
/**
70
* TODO:
71
*
72
* macro to flag unimplemented blocks
73
*/
74
#define TODO \
75
xmlGenericError(xmlGenericErrorContext, \
76
"Unimplemented block at %s:%d\n", \
77
__FILE__, __LINE__);
78
79
/************************************************************************
80
* *
81
* Datatypes and structures *
82
* *
83
************************************************************************/
84
85
/*
86
* Note: the order of the enums below is significant, do not shuffle
87
*/
88
typedef enum {
89
XML_REGEXP_EPSILON = 1,
90
XML_REGEXP_CHARVAL,
91
XML_REGEXP_RANGES,
92
XML_REGEXP_SUBREG, /* used for () sub regexps */
93
XML_REGEXP_STRING,
94
XML_REGEXP_ANYCHAR, /* . */
95
XML_REGEXP_ANYSPACE, /* \s */
96
XML_REGEXP_NOTSPACE, /* \S */
97
XML_REGEXP_INITNAME, /* \l */
98
XML_REGEXP_NOTINITNAME, /* \L */
99
XML_REGEXP_NAMECHAR, /* \c */
100
XML_REGEXP_NOTNAMECHAR, /* \C */
101
XML_REGEXP_DECIMAL, /* \d */
102
XML_REGEXP_NOTDECIMAL, /* \D */
103
XML_REGEXP_REALCHAR, /* \w */
104
XML_REGEXP_NOTREALCHAR, /* \W */
105
XML_REGEXP_LETTER = 100,
106
XML_REGEXP_LETTER_UPPERCASE,
107
XML_REGEXP_LETTER_LOWERCASE,
108
XML_REGEXP_LETTER_TITLECASE,
109
XML_REGEXP_LETTER_MODIFIER,
110
XML_REGEXP_LETTER_OTHERS,
111
XML_REGEXP_MARK,
112
XML_REGEXP_MARK_NONSPACING,
113
XML_REGEXP_MARK_SPACECOMBINING,
114
XML_REGEXP_MARK_ENCLOSING,
115
XML_REGEXP_NUMBER,
116
XML_REGEXP_NUMBER_DECIMAL,
117
XML_REGEXP_NUMBER_LETTER,
118
XML_REGEXP_NUMBER_OTHERS,
119
XML_REGEXP_PUNCT,
120
XML_REGEXP_PUNCT_CONNECTOR,
121
XML_REGEXP_PUNCT_DASH,
122
XML_REGEXP_PUNCT_OPEN,
123
XML_REGEXP_PUNCT_CLOSE,
124
XML_REGEXP_PUNCT_INITQUOTE,
125
XML_REGEXP_PUNCT_FINQUOTE,
126
XML_REGEXP_PUNCT_OTHERS,
127
XML_REGEXP_SEPAR,
128
XML_REGEXP_SEPAR_SPACE,
129
XML_REGEXP_SEPAR_LINE,
130
XML_REGEXP_SEPAR_PARA,
131
XML_REGEXP_SYMBOL,
132
XML_REGEXP_SYMBOL_MATH,
133
XML_REGEXP_SYMBOL_CURRENCY,
134
XML_REGEXP_SYMBOL_MODIFIER,
135
XML_REGEXP_SYMBOL_OTHERS,
136
XML_REGEXP_OTHER,
137
XML_REGEXP_OTHER_CONTROL,
138
XML_REGEXP_OTHER_FORMAT,
139
XML_REGEXP_OTHER_PRIVATE,
140
XML_REGEXP_OTHER_NA,
141
XML_REGEXP_BLOCK_NAME
142
} xmlRegAtomType;
143
144
typedef enum {
145
XML_REGEXP_QUANT_EPSILON = 1,
146
XML_REGEXP_QUANT_ONCE,
147
XML_REGEXP_QUANT_OPT,
148
XML_REGEXP_QUANT_MULT,
149
XML_REGEXP_QUANT_PLUS,
150
XML_REGEXP_QUANT_ONCEONLY,
151
XML_REGEXP_QUANT_ALL,
152
XML_REGEXP_QUANT_RANGE
153
} xmlRegQuantType;
154
155
typedef enum {
156
XML_REGEXP_START_STATE = 1,
157
XML_REGEXP_FINAL_STATE,
158
XML_REGEXP_TRANS_STATE,
159
XML_REGEXP_SINK_STATE,
160
XML_REGEXP_UNREACH_STATE
161
} xmlRegStateType;
162
163
typedef enum {
164
XML_REGEXP_MARK_NORMAL = 0,
165
XML_REGEXP_MARK_START,
166
XML_REGEXP_MARK_VISITED
167
} xmlRegMarkedType;
168
169
typedef struct _xmlRegRange xmlRegRange;
170
typedef xmlRegRange *xmlRegRangePtr;
171
172
struct _xmlRegRange {
173
int neg; /* 0 normal, 1 not, 2 exclude */
174
xmlRegAtomType type;
175
int start;
176
int end;
177
xmlChar *blockName;
178
};
179
180
typedef struct _xmlRegAtom xmlRegAtom;
181
typedef xmlRegAtom *xmlRegAtomPtr;
182
183
typedef struct _xmlAutomataState xmlRegState;
184
typedef xmlRegState *xmlRegStatePtr;
185
186
struct _xmlRegAtom {
187
int no;
188
xmlRegAtomType type;
189
xmlRegQuantType quant;
190
int min;
191
int max;
192
193
void *valuep;
194
void *valuep2;
195
int neg;
196
int codepoint;
197
xmlRegStatePtr start;
198
xmlRegStatePtr start0;
199
xmlRegStatePtr stop;
200
int maxRanges;
201
int nbRanges;
202
xmlRegRangePtr *ranges;
203
void *data;
204
};
205
206
typedef struct _xmlRegCounter xmlRegCounter;
207
typedef xmlRegCounter *xmlRegCounterPtr;
208
209
struct _xmlRegCounter {
210
int min;
211
int max;
212
};
213
214
typedef struct _xmlRegTrans xmlRegTrans;
215
typedef xmlRegTrans *xmlRegTransPtr;
216
217
struct _xmlRegTrans {
218
xmlRegAtomPtr atom;
219
int to;
220
int counter;
221
int count;
222
int nd;
223
};
224
225
struct _xmlAutomataState {
226
xmlRegStateType type;
227
xmlRegMarkedType mark;
228
xmlRegMarkedType markd;
229
xmlRegMarkedType reached;
230
int no;
231
int maxTrans;
232
int nbTrans;
233
xmlRegTrans *trans;
234
/* knowing states pointing to us can speed things up */
235
int maxTransTo;
236
int nbTransTo;
237
int *transTo;
238
};
239
240
typedef struct _xmlAutomata xmlRegParserCtxt;
241
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
242
243
#define AM_AUTOMATA_RNG 1
244
245
struct _xmlAutomata {
246
xmlChar *string;
247
xmlChar *cur;
248
249
int error;
250
int neg;
251
252
xmlRegStatePtr start;
253
xmlRegStatePtr end;
254
xmlRegStatePtr state;
255
256
xmlRegAtomPtr atom;
257
258
int maxAtoms;
259
int nbAtoms;
260
xmlRegAtomPtr *atoms;
261
262
int maxStates;
263
int nbStates;
264
xmlRegStatePtr *states;
265
266
int maxCounters;
267
int nbCounters;
268
xmlRegCounter *counters;
269
270
int determinist;
271
int negs;
272
int flags;
273
274
int depth;
275
};
276
277
struct _xmlRegexp {
278
xmlChar *string;
279
int nbStates;
280
xmlRegStatePtr *states;
281
int nbAtoms;
282
xmlRegAtomPtr *atoms;
283
int nbCounters;
284
xmlRegCounter *counters;
285
int determinist;
286
int flags;
287
/*
288
* That's the compact form for determinists automatas
289
*/
290
int nbstates;
291
int *compact;
292
void **transdata;
293
int nbstrings;
294
xmlChar **stringMap;
295
};
296
297
typedef struct _xmlRegExecRollback xmlRegExecRollback;
298
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
299
300
struct _xmlRegExecRollback {
301
xmlRegStatePtr state;/* the current state */
302
int index; /* the index in the input stack */
303
int nextbranch; /* the next transition to explore in that state */
304
int *counts; /* save the automata state if it has some */
305
};
306
307
typedef struct _xmlRegInputToken xmlRegInputToken;
308
typedef xmlRegInputToken *xmlRegInputTokenPtr;
309
310
struct _xmlRegInputToken {
311
xmlChar *value;
312
void *data;
313
};
314
315
struct _xmlRegExecCtxt {
316
int status; /* execution status != 0 indicate an error */
317
int determinist; /* did we find an indeterministic behaviour */
318
xmlRegexpPtr comp; /* the compiled regexp */
319
xmlRegExecCallbacks callback;
320
void *data;
321
322
xmlRegStatePtr state;/* the current state */
323
int transno; /* the current transition on that state */
324
int transcount; /* the number of chars in char counted transitions */
325
326
/*
327
* A stack of rollback states
328
*/
329
int maxRollbacks;
330
int nbRollbacks;
331
xmlRegExecRollback *rollbacks;
332
333
/*
334
* The state of the automata if any
335
*/
336
int *counts;
337
338
/*
339
* The input stack
340
*/
341
int inputStackMax;
342
int inputStackNr;
343
int index;
344
int *charStack;
345
const xmlChar *inputString; /* when operating on characters */
346
xmlRegInputTokenPtr inputStack;/* when operating on strings */
347
348
/*
349
* error handling
350
*/
351
int errStateNo; /* the error state number */
352
xmlRegStatePtr errState; /* the error state */
353
xmlChar *errString; /* the string raising the error */
354
int *errCounts; /* counters at the error state */
355
int nbPush;
356
};
357
358
#define REGEXP_ALL_COUNTER 0x123456
359
#define REGEXP_ALL_LAX_COUNTER 0x123457
360
361
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
362
static void xmlRegFreeState(xmlRegStatePtr state);
363
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
364
static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
365
static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
366
static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
367
int neg, int start, int end, const xmlChar *blockName);
368
369
/************************************************************************
370
* *
371
* Regexp memory error handler *
372
* *
373
************************************************************************/
374
/**
375
* xmlRegexpErrMemory:
376
* @extra: extra information
377
*
378
* Handle an out of memory condition
379
*/
380
static void
381
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
382
{
383
const char *regexp = NULL;
384
if (ctxt != NULL) {
385
regexp = (const char *) ctxt->string;
386
ctxt->error = XML_ERR_NO_MEMORY;
387
}
388
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
389
XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
390
regexp, NULL, 0, 0,
391
"Memory allocation failed : %s\n", extra);
392
}
393
394
/**
395
* xmlRegexpErrCompile:
396
* @extra: extra information
397
*
398
* Handle a compilation failure
399
*/
400
static void
401
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
402
{
403
const char *regexp = NULL;
404
int idx = 0;
405
406
if (ctxt != NULL) {
407
regexp = (const char *) ctxt->string;
408
idx = ctxt->cur - ctxt->string;
409
ctxt->error = XML_REGEXP_COMPILE_ERROR;
410
}
411
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
412
XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
413
regexp, NULL, idx, 0,
414
"failed to compile: %s\n", extra);
415
}
416
417
/************************************************************************
418
* *
419
* Allocation/Deallocation *
420
* *
421
************************************************************************/
422
423
static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
424
425
/**
426
* xmlRegCalloc2:
427
* @dim1: size of first dimension
428
* @dim2: size of second dimension
429
* @elemSize: size of element
430
*
431
* Allocate a two-dimensional array and set all elements to zero.
432
*
433
* Returns the new array or NULL in case of error.
434
*/
435
static void*
436
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
437
size_t totalSize;
438
void *ret;
439
440
/* Check for overflow */
441
if ((dim2 == 0) || (elemSize == 0) ||
442
(dim1 > SIZE_MAX / dim2 / elemSize))
443
return (NULL);
444
totalSize = dim1 * dim2 * elemSize;
445
ret = xmlMalloc(totalSize);
446
if (ret != NULL)
447
memset(ret, 0, totalSize);
448
return (ret);
449
}
450
451
/**
452
* xmlRegEpxFromParse:
453
* @ctxt: the parser context used to build it
454
*
455
* Allocate a new regexp and fill it with the result from the parser
456
*
457
* Returns the new regexp or NULL in case of error
458
*/
459
static xmlRegexpPtr
460
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
461
xmlRegexpPtr ret;
462
463
ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
464
if (ret == NULL) {
465
xmlRegexpErrMemory(ctxt, "compiling regexp");
466
return(NULL);
467
}
468
memset(ret, 0, sizeof(xmlRegexp));
469
ret->string = ctxt->string;
470
ret->nbStates = ctxt->nbStates;
471
ret->states = ctxt->states;
472
ret->nbAtoms = ctxt->nbAtoms;
473
ret->atoms = ctxt->atoms;
474
ret->nbCounters = ctxt->nbCounters;
475
ret->counters = ctxt->counters;
476
ret->determinist = ctxt->determinist;
477
ret->flags = ctxt->flags;
478
if (ret->determinist == -1) {
479
if (xmlRegexpIsDeterminist(ret) < 0) {
480
xmlRegexpErrMemory(ctxt, "checking determinism");
481
xmlFree(ret);
482
return(NULL);
483
}
484
}
485
486
if ((ret->determinist != 0) &&
487
(ret->nbCounters == 0) &&
488
(ctxt->negs == 0) &&
489
(ret->atoms != NULL) &&
490
(ret->atoms[0] != NULL) &&
491
(ret->atoms[0]->type == XML_REGEXP_STRING)) {
492
int i, j, nbstates = 0, nbatoms = 0;
493
int *stateRemap;
494
int *stringRemap;
495
int *transitions;
496
void **transdata;
497
xmlChar **stringMap;
498
xmlChar *value;
499
500
/*
501
* Switch to a compact representation
502
* 1/ counting the effective number of states left
503
* 2/ counting the unique number of atoms, and check that
504
* they are all of the string type
505
* 3/ build a table state x atom for the transitions
506
*/
507
508
stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
509
if (stateRemap == NULL) {
510
xmlRegexpErrMemory(ctxt, "compiling regexp");
511
xmlFree(ret);
512
return(NULL);
513
}
514
for (i = 0;i < ret->nbStates;i++) {
515
if (ret->states[i] != NULL) {
516
stateRemap[i] = nbstates;
517
nbstates++;
518
} else {
519
stateRemap[i] = -1;
520
}
521
}
522
stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
523
if (stringMap == NULL) {
524
xmlRegexpErrMemory(ctxt, "compiling regexp");
525
xmlFree(stateRemap);
526
xmlFree(ret);
527
return(NULL);
528
}
529
stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
530
if (stringRemap == NULL) {
531
xmlRegexpErrMemory(ctxt, "compiling regexp");
532
xmlFree(stringMap);
533
xmlFree(stateRemap);
534
xmlFree(ret);
535
return(NULL);
536
}
537
for (i = 0;i < ret->nbAtoms;i++) {
538
if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
539
(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
540
value = ret->atoms[i]->valuep;
541
for (j = 0;j < nbatoms;j++) {
542
if (xmlStrEqual(stringMap[j], value)) {
543
stringRemap[i] = j;
544
break;
545
}
546
}
547
if (j >= nbatoms) {
548
stringRemap[i] = nbatoms;
549
stringMap[nbatoms] = xmlStrdup(value);
550
if (stringMap[nbatoms] == NULL) {
551
for (i = 0;i < nbatoms;i++)
552
xmlFree(stringMap[i]);
553
xmlFree(stringRemap);
554
xmlFree(stringMap);
555
xmlFree(stateRemap);
556
xmlFree(ret);
557
return(NULL);
558
}
559
nbatoms++;
560
}
561
} else {
562
xmlFree(stateRemap);
563
xmlFree(stringRemap);
564
for (i = 0;i < nbatoms;i++)
565
xmlFree(stringMap[i]);
566
xmlFree(stringMap);
567
xmlFree(ret);
568
return(NULL);
569
}
570
}
571
transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
572
sizeof(int));
573
if (transitions == NULL) {
574
xmlFree(stateRemap);
575
xmlFree(stringRemap);
576
for (i = 0;i < nbatoms;i++)
577
xmlFree(stringMap[i]);
578
xmlFree(stringMap);
579
xmlFree(ret);
580
return(NULL);
581
}
582
583
/*
584
* Allocate the transition table. The first entry for each
585
* state corresponds to the state type.
586
*/
587
transdata = NULL;
588
589
for (i = 0;i < ret->nbStates;i++) {
590
int stateno, atomno, targetno, prev;
591
xmlRegStatePtr state;
592
xmlRegTransPtr trans;
593
594
stateno = stateRemap[i];
595
if (stateno == -1)
596
continue;
597
state = ret->states[i];
598
599
transitions[stateno * (nbatoms + 1)] = state->type;
600
601
for (j = 0;j < state->nbTrans;j++) {
602
trans = &(state->trans[j]);
603
if ((trans->to < 0) || (trans->atom == NULL))
604
continue;
605
atomno = stringRemap[trans->atom->no];
606
if ((trans->atom->data != NULL) && (transdata == NULL)) {
607
transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
608
sizeof(void *));
609
if (transdata == NULL) {
610
xmlRegexpErrMemory(ctxt, "compiling regexp");
611
break;
612
}
613
}
614
targetno = stateRemap[trans->to];
615
/*
616
* if the same atom can generate transitions to 2 different
617
* states then it means the automata is not deterministic and
618
* the compact form can't be used !
619
*/
620
prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
621
if (prev != 0) {
622
if (prev != targetno + 1) {
623
ret->determinist = 0;
624
if (transdata != NULL)
625
xmlFree(transdata);
626
xmlFree(transitions);
627
xmlFree(stateRemap);
628
xmlFree(stringRemap);
629
for (i = 0;i < nbatoms;i++)
630
xmlFree(stringMap[i]);
631
xmlFree(stringMap);
632
goto not_determ;
633
}
634
} else {
635
#if 0
636
printf("State %d trans %d: atom %d to %d : %d to %d\n",
637
i, j, trans->atom->no, trans->to, atomno, targetno);
638
#endif
639
transitions[stateno * (nbatoms + 1) + atomno + 1] =
640
targetno + 1; /* to avoid 0 */
641
if (transdata != NULL)
642
transdata[stateno * nbatoms + atomno] =
643
trans->atom->data;
644
}
645
}
646
}
647
ret->determinist = 1;
648
/*
649
* Cleanup of the old data
650
*/
651
if (ret->states != NULL) {
652
for (i = 0;i < ret->nbStates;i++)
653
xmlRegFreeState(ret->states[i]);
654
xmlFree(ret->states);
655
}
656
ret->states = NULL;
657
ret->nbStates = 0;
658
if (ret->atoms != NULL) {
659
for (i = 0;i < ret->nbAtoms;i++)
660
xmlRegFreeAtom(ret->atoms[i]);
661
xmlFree(ret->atoms);
662
}
663
ret->atoms = NULL;
664
ret->nbAtoms = 0;
665
666
ret->compact = transitions;
667
ret->transdata = transdata;
668
ret->stringMap = stringMap;
669
ret->nbstrings = nbatoms;
670
ret->nbstates = nbstates;
671
xmlFree(stateRemap);
672
xmlFree(stringRemap);
673
}
674
not_determ:
675
ctxt->string = NULL;
676
ctxt->nbStates = 0;
677
ctxt->states = NULL;
678
ctxt->nbAtoms = 0;
679
ctxt->atoms = NULL;
680
ctxt->nbCounters = 0;
681
ctxt->counters = NULL;
682
return(ret);
683
}
684
685
/**
686
* xmlRegNewParserCtxt:
687
* @string: the string to parse
688
*
689
* Allocate a new regexp parser context
690
*
691
* Returns the new context or NULL in case of error
692
*/
693
static xmlRegParserCtxtPtr
694
xmlRegNewParserCtxt(const xmlChar *string) {
695
xmlRegParserCtxtPtr ret;
696
697
ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
698
if (ret == NULL)
699
return(NULL);
700
memset(ret, 0, sizeof(xmlRegParserCtxt));
701
if (string != NULL)
702
ret->string = xmlStrdup(string);
703
ret->cur = ret->string;
704
ret->neg = 0;
705
ret->negs = 0;
706
ret->error = 0;
707
ret->determinist = -1;
708
return(ret);
709
}
710
711
/**
712
* xmlRegNewRange:
713
* @ctxt: the regexp parser context
714
* @neg: is that negative
715
* @type: the type of range
716
* @start: the start codepoint
717
* @end: the end codepoint
718
*
719
* Allocate a new regexp range
720
*
721
* Returns the new range or NULL in case of error
722
*/
723
static xmlRegRangePtr
724
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
725
int neg, xmlRegAtomType type, int start, int end) {
726
xmlRegRangePtr ret;
727
728
ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
729
if (ret == NULL) {
730
xmlRegexpErrMemory(ctxt, "allocating range");
731
return(NULL);
732
}
733
ret->neg = neg;
734
ret->type = type;
735
ret->start = start;
736
ret->end = end;
737
return(ret);
738
}
739
740
/**
741
* xmlRegFreeRange:
742
* @range: the regexp range
743
*
744
* Free a regexp range
745
*/
746
static void
747
xmlRegFreeRange(xmlRegRangePtr range) {
748
if (range == NULL)
749
return;
750
751
if (range->blockName != NULL)
752
xmlFree(range->blockName);
753
xmlFree(range);
754
}
755
756
/**
757
* xmlRegCopyRange:
758
* @range: the regexp range
759
*
760
* Copy a regexp range
761
*
762
* Returns the new copy or NULL in case of error.
763
*/
764
static xmlRegRangePtr
765
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
766
xmlRegRangePtr ret;
767
768
if (range == NULL)
769
return(NULL);
770
771
ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
772
range->end);
773
if (ret == NULL)
774
return(NULL);
775
if (range->blockName != NULL) {
776
ret->blockName = xmlStrdup(range->blockName);
777
if (ret->blockName == NULL) {
778
xmlRegexpErrMemory(ctxt, "allocating range");
779
xmlRegFreeRange(ret);
780
return(NULL);
781
}
782
}
783
return(ret);
784
}
785
786
/**
787
* xmlRegNewAtom:
788
* @ctxt: the regexp parser context
789
* @type: the type of atom
790
*
791
* Allocate a new atom
792
*
793
* Returns the new atom or NULL in case of error
794
*/
795
static xmlRegAtomPtr
796
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
797
xmlRegAtomPtr ret;
798
799
ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
800
if (ret == NULL) {
801
xmlRegexpErrMemory(ctxt, "allocating atom");
802
return(NULL);
803
}
804
memset(ret, 0, sizeof(xmlRegAtom));
805
ret->type = type;
806
ret->quant = XML_REGEXP_QUANT_ONCE;
807
ret->min = 0;
808
ret->max = 0;
809
return(ret);
810
}
811
812
/**
813
* xmlRegFreeAtom:
814
* @atom: the regexp atom
815
*
816
* Free a regexp atom
817
*/
818
static void
819
xmlRegFreeAtom(xmlRegAtomPtr atom) {
820
int i;
821
822
if (atom == NULL)
823
return;
824
825
for (i = 0;i < atom->nbRanges;i++)
826
xmlRegFreeRange(atom->ranges[i]);
827
if (atom->ranges != NULL)
828
xmlFree(atom->ranges);
829
if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
830
xmlFree(atom->valuep);
831
if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
832
xmlFree(atom->valuep2);
833
if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
834
xmlFree(atom->valuep);
835
xmlFree(atom);
836
}
837
838
/**
839
* xmlRegCopyAtom:
840
* @ctxt: the regexp parser context
841
* @atom: the original atom
842
*
843
* Allocate a new regexp range
844
*
845
* Returns the new atom or NULL in case of error
846
*/
847
static xmlRegAtomPtr
848
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
849
xmlRegAtomPtr ret;
850
851
ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
852
if (ret == NULL) {
853
xmlRegexpErrMemory(ctxt, "copying atom");
854
return(NULL);
855
}
856
memset(ret, 0, sizeof(xmlRegAtom));
857
ret->type = atom->type;
858
ret->quant = atom->quant;
859
ret->min = atom->min;
860
ret->max = atom->max;
861
if (atom->nbRanges > 0) {
862
int i;
863
864
ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
865
atom->nbRanges);
866
if (ret->ranges == NULL) {
867
xmlRegexpErrMemory(ctxt, "copying atom");
868
goto error;
869
}
870
for (i = 0;i < atom->nbRanges;i++) {
871
ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
872
if (ret->ranges[i] == NULL)
873
goto error;
874
ret->nbRanges = i + 1;
875
}
876
}
877
return(ret);
878
879
error:
880
xmlRegFreeAtom(ret);
881
return(NULL);
882
}
883
884
static xmlRegStatePtr
885
xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
886
xmlRegStatePtr ret;
887
888
ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
889
if (ret == NULL) {
890
xmlRegexpErrMemory(ctxt, "allocating state");
891
return(NULL);
892
}
893
memset(ret, 0, sizeof(xmlRegState));
894
ret->type = XML_REGEXP_TRANS_STATE;
895
ret->mark = XML_REGEXP_MARK_NORMAL;
896
return(ret);
897
}
898
899
/**
900
* xmlRegFreeState:
901
* @state: the regexp state
902
*
903
* Free a regexp state
904
*/
905
static void
906
xmlRegFreeState(xmlRegStatePtr state) {
907
if (state == NULL)
908
return;
909
910
if (state->trans != NULL)
911
xmlFree(state->trans);
912
if (state->transTo != NULL)
913
xmlFree(state->transTo);
914
xmlFree(state);
915
}
916
917
/**
918
* xmlRegFreeParserCtxt:
919
* @ctxt: the regexp parser context
920
*
921
* Free a regexp parser context
922
*/
923
static void
924
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
925
int i;
926
if (ctxt == NULL)
927
return;
928
929
if (ctxt->string != NULL)
930
xmlFree(ctxt->string);
931
if (ctxt->states != NULL) {
932
for (i = 0;i < ctxt->nbStates;i++)
933
xmlRegFreeState(ctxt->states[i]);
934
xmlFree(ctxt->states);
935
}
936
if (ctxt->atoms != NULL) {
937
for (i = 0;i < ctxt->nbAtoms;i++)
938
xmlRegFreeAtom(ctxt->atoms[i]);
939
xmlFree(ctxt->atoms);
940
}
941
if (ctxt->counters != NULL)
942
xmlFree(ctxt->counters);
943
xmlFree(ctxt);
944
}
945
946
/************************************************************************
947
* *
948
* Display of Data structures *
949
* *
950
************************************************************************/
951
952
static void
953
xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
954
switch (type) {
955
case XML_REGEXP_EPSILON:
956
fprintf(output, "epsilon "); break;
957
case XML_REGEXP_CHARVAL:
958
fprintf(output, "charval "); break;
959
case XML_REGEXP_RANGES:
960
fprintf(output, "ranges "); break;
961
case XML_REGEXP_SUBREG:
962
fprintf(output, "subexpr "); break;
963
case XML_REGEXP_STRING:
964
fprintf(output, "string "); break;
965
case XML_REGEXP_ANYCHAR:
966
fprintf(output, "anychar "); break;
967
case XML_REGEXP_ANYSPACE:
968
fprintf(output, "anyspace "); break;
969
case XML_REGEXP_NOTSPACE:
970
fprintf(output, "notspace "); break;
971
case XML_REGEXP_INITNAME:
972
fprintf(output, "initname "); break;
973
case XML_REGEXP_NOTINITNAME:
974
fprintf(output, "notinitname "); break;
975
case XML_REGEXP_NAMECHAR:
976
fprintf(output, "namechar "); break;
977
case XML_REGEXP_NOTNAMECHAR:
978
fprintf(output, "notnamechar "); break;
979
case XML_REGEXP_DECIMAL:
980
fprintf(output, "decimal "); break;
981
case XML_REGEXP_NOTDECIMAL:
982
fprintf(output, "notdecimal "); break;
983
case XML_REGEXP_REALCHAR:
984
fprintf(output, "realchar "); break;
985
case XML_REGEXP_NOTREALCHAR:
986
fprintf(output, "notrealchar "); break;
987
case XML_REGEXP_LETTER:
988
fprintf(output, "LETTER "); break;
989
case XML_REGEXP_LETTER_UPPERCASE:
990
fprintf(output, "LETTER_UPPERCASE "); break;
991
case XML_REGEXP_LETTER_LOWERCASE:
992
fprintf(output, "LETTER_LOWERCASE "); break;
993
case XML_REGEXP_LETTER_TITLECASE:
994
fprintf(output, "LETTER_TITLECASE "); break;
995
case XML_REGEXP_LETTER_MODIFIER:
996
fprintf(output, "LETTER_MODIFIER "); break;
997
case XML_REGEXP_LETTER_OTHERS:
998
fprintf(output, "LETTER_OTHERS "); break;
999
case XML_REGEXP_MARK:
1000
fprintf(output, "MARK "); break;
1001
case XML_REGEXP_MARK_NONSPACING:
1002
fprintf(output, "MARK_NONSPACING "); break;
1003
case XML_REGEXP_MARK_SPACECOMBINING:
1004
fprintf(output, "MARK_SPACECOMBINING "); break;
1005
case XML_REGEXP_MARK_ENCLOSING:
1006
fprintf(output, "MARK_ENCLOSING "); break;
1007
case XML_REGEXP_NUMBER:
1008
fprintf(output, "NUMBER "); break;
1009
case XML_REGEXP_NUMBER_DECIMAL:
1010
fprintf(output, "NUMBER_DECIMAL "); break;
1011
case XML_REGEXP_NUMBER_LETTER:
1012
fprintf(output, "NUMBER_LETTER "); break;
1013
case XML_REGEXP_NUMBER_OTHERS:
1014
fprintf(output, "NUMBER_OTHERS "); break;
1015
case XML_REGEXP_PUNCT:
1016
fprintf(output, "PUNCT "); break;
1017
case XML_REGEXP_PUNCT_CONNECTOR:
1018
fprintf(output, "PUNCT_CONNECTOR "); break;
1019
case XML_REGEXP_PUNCT_DASH:
1020
fprintf(output, "PUNCT_DASH "); break;
1021
case XML_REGEXP_PUNCT_OPEN:
1022
fprintf(output, "PUNCT_OPEN "); break;
1023
case XML_REGEXP_PUNCT_CLOSE:
1024
fprintf(output, "PUNCT_CLOSE "); break;
1025
case XML_REGEXP_PUNCT_INITQUOTE:
1026
fprintf(output, "PUNCT_INITQUOTE "); break;
1027
case XML_REGEXP_PUNCT_FINQUOTE:
1028
fprintf(output, "PUNCT_FINQUOTE "); break;
1029
case XML_REGEXP_PUNCT_OTHERS:
1030
fprintf(output, "PUNCT_OTHERS "); break;
1031
case XML_REGEXP_SEPAR:
1032
fprintf(output, "SEPAR "); break;
1033
case XML_REGEXP_SEPAR_SPACE:
1034
fprintf(output, "SEPAR_SPACE "); break;
1035
case XML_REGEXP_SEPAR_LINE:
1036
fprintf(output, "SEPAR_LINE "); break;
1037
case XML_REGEXP_SEPAR_PARA:
1038
fprintf(output, "SEPAR_PARA "); break;
1039
case XML_REGEXP_SYMBOL:
1040
fprintf(output, "SYMBOL "); break;
1041
case XML_REGEXP_SYMBOL_MATH:
1042
fprintf(output, "SYMBOL_MATH "); break;
1043
case XML_REGEXP_SYMBOL_CURRENCY:
1044
fprintf(output, "SYMBOL_CURRENCY "); break;
1045
case XML_REGEXP_SYMBOL_MODIFIER:
1046
fprintf(output, "SYMBOL_MODIFIER "); break;
1047
case XML_REGEXP_SYMBOL_OTHERS:
1048
fprintf(output, "SYMBOL_OTHERS "); break;
1049
case XML_REGEXP_OTHER:
1050
fprintf(output, "OTHER "); break;
1051
case XML_REGEXP_OTHER_CONTROL:
1052
fprintf(output, "OTHER_CONTROL "); break;
1053
case XML_REGEXP_OTHER_FORMAT:
1054
fprintf(output, "OTHER_FORMAT "); break;
1055
case XML_REGEXP_OTHER_PRIVATE:
1056
fprintf(output, "OTHER_PRIVATE "); break;
1057
case XML_REGEXP_OTHER_NA:
1058
fprintf(output, "OTHER_NA "); break;
1059
case XML_REGEXP_BLOCK_NAME:
1060
fprintf(output, "BLOCK "); break;
1061
}
1062
}
1063
1064
static void
1065
xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1066
switch (type) {
1067
case XML_REGEXP_QUANT_EPSILON:
1068
fprintf(output, "epsilon "); break;
1069
case XML_REGEXP_QUANT_ONCE:
1070
fprintf(output, "once "); break;
1071
case XML_REGEXP_QUANT_OPT:
1072
fprintf(output, "? "); break;
1073
case XML_REGEXP_QUANT_MULT:
1074
fprintf(output, "* "); break;
1075
case XML_REGEXP_QUANT_PLUS:
1076
fprintf(output, "+ "); break;
1077
case XML_REGEXP_QUANT_RANGE:
1078
fprintf(output, "range "); break;
1079
case XML_REGEXP_QUANT_ONCEONLY:
1080
fprintf(output, "onceonly "); break;
1081
case XML_REGEXP_QUANT_ALL:
1082
fprintf(output, "all "); break;
1083
}
1084
}
1085
static void
1086
xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1087
fprintf(output, " range: ");
1088
if (range->neg)
1089
fprintf(output, "negative ");
1090
xmlRegPrintAtomType(output, range->type);
1091
fprintf(output, "%c - %c\n", range->start, range->end);
1092
}
1093
1094
static void
1095
xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1096
fprintf(output, " atom: ");
1097
if (atom == NULL) {
1098
fprintf(output, "NULL\n");
1099
return;
1100
}
1101
if (atom->neg)
1102
fprintf(output, "not ");
1103
xmlRegPrintAtomType(output, atom->type);
1104
xmlRegPrintQuantType(output, atom->quant);
1105
if (atom->quant == XML_REGEXP_QUANT_RANGE)
1106
fprintf(output, "%d-%d ", atom->min, atom->max);
1107
if (atom->type == XML_REGEXP_STRING)
1108
fprintf(output, "'%s' ", (char *) atom->valuep);
1109
if (atom->type == XML_REGEXP_CHARVAL)
1110
fprintf(output, "char %c\n", atom->codepoint);
1111
else if (atom->type == XML_REGEXP_RANGES) {
1112
int i;
1113
fprintf(output, "%d entries\n", atom->nbRanges);
1114
for (i = 0; i < atom->nbRanges;i++)
1115
xmlRegPrintRange(output, atom->ranges[i]);
1116
} else if (atom->type == XML_REGEXP_SUBREG) {
1117
fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1118
} else {
1119
fprintf(output, "\n");
1120
}
1121
}
1122
1123
static void
1124
xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1125
fprintf(output, " trans: ");
1126
if (trans == NULL) {
1127
fprintf(output, "NULL\n");
1128
return;
1129
}
1130
if (trans->to < 0) {
1131
fprintf(output, "removed\n");
1132
return;
1133
}
1134
if (trans->nd != 0) {
1135
if (trans->nd == 2)
1136
fprintf(output, "last not determinist, ");
1137
else
1138
fprintf(output, "not determinist, ");
1139
}
1140
if (trans->counter >= 0) {
1141
fprintf(output, "counted %d, ", trans->counter);
1142
}
1143
if (trans->count == REGEXP_ALL_COUNTER) {
1144
fprintf(output, "all transition, ");
1145
} else if (trans->count >= 0) {
1146
fprintf(output, "count based %d, ", trans->count);
1147
}
1148
if (trans->atom == NULL) {
1149
fprintf(output, "epsilon to %d\n", trans->to);
1150
return;
1151
}
1152
if (trans->atom->type == XML_REGEXP_CHARVAL)
1153
fprintf(output, "char %c ", trans->atom->codepoint);
1154
fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1155
}
1156
1157
static void
1158
xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1159
int i;
1160
1161
fprintf(output, " state: ");
1162
if (state == NULL) {
1163
fprintf(output, "NULL\n");
1164
return;
1165
}
1166
if (state->type == XML_REGEXP_START_STATE)
1167
fprintf(output, "START ");
1168
if (state->type == XML_REGEXP_FINAL_STATE)
1169
fprintf(output, "FINAL ");
1170
1171
fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1172
for (i = 0;i < state->nbTrans; i++) {
1173
xmlRegPrintTrans(output, &(state->trans[i]));
1174
}
1175
}
1176
1177
/************************************************************************
1178
* *
1179
* Finite Automata structures manipulations *
1180
* *
1181
************************************************************************/
1182
1183
static xmlRegRangePtr
1184
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1185
int neg, xmlRegAtomType type, int start, int end,
1186
xmlChar *blockName) {
1187
xmlRegRangePtr range;
1188
1189
if (atom == NULL) {
1190
ERROR("add range: atom is NULL");
1191
return(NULL);
1192
}
1193
if (atom->type != XML_REGEXP_RANGES) {
1194
ERROR("add range: atom is not ranges");
1195
return(NULL);
1196
}
1197
if (atom->maxRanges == 0) {
1198
atom->maxRanges = 4;
1199
atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1200
sizeof(xmlRegRangePtr));
1201
if (atom->ranges == NULL) {
1202
xmlRegexpErrMemory(ctxt, "adding ranges");
1203
atom->maxRanges = 0;
1204
return(NULL);
1205
}
1206
} else if (atom->nbRanges >= atom->maxRanges) {
1207
xmlRegRangePtr *tmp;
1208
atom->maxRanges *= 2;
1209
tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1210
sizeof(xmlRegRangePtr));
1211
if (tmp == NULL) {
1212
xmlRegexpErrMemory(ctxt, "adding ranges");
1213
atom->maxRanges /= 2;
1214
return(NULL);
1215
}
1216
atom->ranges = tmp;
1217
}
1218
range = xmlRegNewRange(ctxt, neg, type, start, end);
1219
if (range == NULL)
1220
return(NULL);
1221
range->blockName = blockName;
1222
atom->ranges[atom->nbRanges++] = range;
1223
1224
return(range);
1225
}
1226
1227
static int
1228
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1229
if (ctxt->maxCounters == 0) {
1230
ctxt->maxCounters = 4;
1231
ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1232
sizeof(xmlRegCounter));
1233
if (ctxt->counters == NULL) {
1234
xmlRegexpErrMemory(ctxt, "allocating counter");
1235
ctxt->maxCounters = 0;
1236
return(-1);
1237
}
1238
} else if (ctxt->nbCounters >= ctxt->maxCounters) {
1239
xmlRegCounter *tmp;
1240
ctxt->maxCounters *= 2;
1241
tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1242
sizeof(xmlRegCounter));
1243
if (tmp == NULL) {
1244
xmlRegexpErrMemory(ctxt, "allocating counter");
1245
ctxt->maxCounters /= 2;
1246
return(-1);
1247
}
1248
ctxt->counters = tmp;
1249
}
1250
ctxt->counters[ctxt->nbCounters].min = -1;
1251
ctxt->counters[ctxt->nbCounters].max = -1;
1252
return(ctxt->nbCounters++);
1253
}
1254
1255
static int
1256
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1257
if (atom == NULL) {
1258
ERROR("atom push: atom is NULL");
1259
return(-1);
1260
}
1261
if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1262
size_t newSize = ctxt->maxAtoms ? ctxt->maxAtoms * 2 : 4;
1263
xmlRegAtomPtr *tmp;
1264
1265
tmp = xmlRealloc(ctxt->atoms, newSize * sizeof(xmlRegAtomPtr));
1266
if (tmp == NULL) {
1267
xmlRegexpErrMemory(ctxt, "allocating counter");
1268
return(-1);
1269
}
1270
ctxt->atoms = tmp;
1271
ctxt->maxAtoms = newSize;
1272
}
1273
atom->no = ctxt->nbAtoms;
1274
ctxt->atoms[ctxt->nbAtoms++] = atom;
1275
return(0);
1276
}
1277
1278
static void
1279
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1280
int from) {
1281
if (target->maxTransTo == 0) {
1282
target->maxTransTo = 8;
1283
target->transTo = (int *) xmlMalloc(target->maxTransTo *
1284
sizeof(int));
1285
if (target->transTo == NULL) {
1286
xmlRegexpErrMemory(ctxt, "adding transition");
1287
target->maxTransTo = 0;
1288
return;
1289
}
1290
} else if (target->nbTransTo >= target->maxTransTo) {
1291
int *tmp;
1292
target->maxTransTo *= 2;
1293
tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1294
sizeof(int));
1295
if (tmp == NULL) {
1296
xmlRegexpErrMemory(ctxt, "adding transition");
1297
target->maxTransTo /= 2;
1298
return;
1299
}
1300
target->transTo = tmp;
1301
}
1302
target->transTo[target->nbTransTo] = from;
1303
target->nbTransTo++;
1304
}
1305
1306
static void
1307
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1308
xmlRegAtomPtr atom, xmlRegStatePtr target,
1309
int counter, int count) {
1310
1311
int nrtrans;
1312
1313
if (state == NULL) {
1314
ERROR("add state: state is NULL");
1315
return;
1316
}
1317
if (target == NULL) {
1318
ERROR("add state: target is NULL");
1319
return;
1320
}
1321
/*
1322
* Other routines follow the philosophy 'When in doubt, add a transition'
1323
* so we check here whether such a transition is already present and, if
1324
* so, silently ignore this request.
1325
*/
1326
1327
for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1328
xmlRegTransPtr trans = &(state->trans[nrtrans]);
1329
if ((trans->atom == atom) &&
1330
(trans->to == target->no) &&
1331
(trans->counter == counter) &&
1332
(trans->count == count)) {
1333
return;
1334
}
1335
}
1336
1337
if (state->maxTrans == 0) {
1338
state->maxTrans = 8;
1339
state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1340
sizeof(xmlRegTrans));
1341
if (state->trans == NULL) {
1342
xmlRegexpErrMemory(ctxt, "adding transition");
1343
state->maxTrans = 0;
1344
return;
1345
}
1346
} else if (state->nbTrans >= state->maxTrans) {
1347
xmlRegTrans *tmp;
1348
state->maxTrans *= 2;
1349
tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1350
sizeof(xmlRegTrans));
1351
if (tmp == NULL) {
1352
xmlRegexpErrMemory(ctxt, "adding transition");
1353
state->maxTrans /= 2;
1354
return;
1355
}
1356
state->trans = tmp;
1357
}
1358
1359
state->trans[state->nbTrans].atom = atom;
1360
state->trans[state->nbTrans].to = target->no;
1361
state->trans[state->nbTrans].counter = counter;
1362
state->trans[state->nbTrans].count = count;
1363
state->trans[state->nbTrans].nd = 0;
1364
state->nbTrans++;
1365
xmlRegStateAddTransTo(ctxt, target, state->no);
1366
}
1367
1368
static xmlRegStatePtr
1369
xmlRegStatePush(xmlRegParserCtxtPtr ctxt) {
1370
xmlRegStatePtr state;
1371
1372
if (ctxt->nbStates >= ctxt->maxStates) {
1373
size_t newSize = ctxt->maxStates ? ctxt->maxStates * 2 : 4;
1374
xmlRegStatePtr *tmp;
1375
1376
tmp = xmlRealloc(ctxt->states, newSize * sizeof(tmp[0]));
1377
if (tmp == NULL) {
1378
xmlRegexpErrMemory(ctxt, "adding state");
1379
return(NULL);
1380
}
1381
ctxt->states = tmp;
1382
ctxt->maxStates = newSize;
1383
}
1384
1385
state = xmlRegNewState(ctxt);
1386
if (state == NULL)
1387
return(NULL);
1388
1389
state->no = ctxt->nbStates;
1390
ctxt->states[ctxt->nbStates++] = state;
1391
1392
return(state);
1393
}
1394
1395
/**
1396
* xmlFAGenerateAllTransition:
1397
* @ctxt: a regexp parser context
1398
* @from: the from state
1399
* @to: the target state or NULL for building a new one
1400
* @lax:
1401
*
1402
*/
1403
static int
1404
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1405
xmlRegStatePtr from, xmlRegStatePtr to,
1406
int lax) {
1407
if (to == NULL) {
1408
to = xmlRegStatePush(ctxt);
1409
if (to == NULL)
1410
return(-1);
1411
ctxt->state = to;
1412
}
1413
if (lax)
1414
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1415
else
1416
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1417
return(0);
1418
}
1419
1420
/**
1421
* xmlFAGenerateEpsilonTransition:
1422
* @ctxt: a regexp parser context
1423
* @from: the from state
1424
* @to: the target state or NULL for building a new one
1425
*
1426
*/
1427
static int
1428
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1429
xmlRegStatePtr from, xmlRegStatePtr to) {
1430
if (to == NULL) {
1431
to = xmlRegStatePush(ctxt);
1432
if (to == NULL)
1433
return(-1);
1434
ctxt->state = to;
1435
}
1436
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1437
return(0);
1438
}
1439
1440
/**
1441
* xmlFAGenerateCountedEpsilonTransition:
1442
* @ctxt: a regexp parser context
1443
* @from: the from state
1444
* @to: the target state or NULL for building a new one
1445
* counter: the counter for that transition
1446
*
1447
*/
1448
static int
1449
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1450
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1451
if (to == NULL) {
1452
to = xmlRegStatePush(ctxt);
1453
if (to == NULL)
1454
return(-1);
1455
ctxt->state = to;
1456
}
1457
xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1458
return(0);
1459
}
1460
1461
/**
1462
* xmlFAGenerateCountedTransition:
1463
* @ctxt: a regexp parser context
1464
* @from: the from state
1465
* @to: the target state or NULL for building a new one
1466
* counter: the counter for that transition
1467
*
1468
*/
1469
static int
1470
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1471
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1472
if (to == NULL) {
1473
to = xmlRegStatePush(ctxt);
1474
if (to == NULL)
1475
return(-1);
1476
ctxt->state = to;
1477
}
1478
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1479
return(0);
1480
}
1481
1482
/**
1483
* xmlFAGenerateTransitions:
1484
* @ctxt: a regexp parser context
1485
* @from: the from state
1486
* @to: the target state or NULL for building a new one
1487
* @atom: the atom generating the transition
1488
*
1489
* Returns 0 if success and -1 in case of error.
1490
*/
1491
static int
1492
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1493
xmlRegStatePtr to, xmlRegAtomPtr atom) {
1494
xmlRegStatePtr end;
1495
int nullable = 0;
1496
1497
if (atom == NULL) {
1498
ERROR("generate transition: atom == NULL");
1499
return(-1);
1500
}
1501
if (atom->type == XML_REGEXP_SUBREG) {
1502
/*
1503
* this is a subexpression handling one should not need to
1504
* create a new node except for XML_REGEXP_QUANT_RANGE.
1505
*/
1506
if ((to != NULL) && (atom->stop != to) &&
1507
(atom->quant != XML_REGEXP_QUANT_RANGE)) {
1508
/*
1509
* Generate an epsilon transition to link to the target
1510
*/
1511
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1512
#ifdef DV
1513
} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1514
(atom->quant != XML_REGEXP_QUANT_ONCE)) {
1515
to = xmlRegStatePush(ctxt, to);
1516
if (to == NULL)
1517
return(-1);
1518
ctxt->state = to;
1519
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1520
#endif
1521
}
1522
switch (atom->quant) {
1523
case XML_REGEXP_QUANT_OPT:
1524
atom->quant = XML_REGEXP_QUANT_ONCE;
1525
/*
1526
* transition done to the state after end of atom.
1527
* 1. set transition from atom start to new state
1528
* 2. set transition from atom end to this state.
1529
*/
1530
if (to == NULL) {
1531
xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1532
xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1533
ctxt->state);
1534
} else {
1535
xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1536
}
1537
break;
1538
case XML_REGEXP_QUANT_MULT:
1539
atom->quant = XML_REGEXP_QUANT_ONCE;
1540
xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1541
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1542
break;
1543
case XML_REGEXP_QUANT_PLUS:
1544
atom->quant = XML_REGEXP_QUANT_ONCE;
1545
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1546
break;
1547
case XML_REGEXP_QUANT_RANGE: {
1548
int counter;
1549
xmlRegStatePtr inter, newstate;
1550
1551
/*
1552
* create the final state now if needed
1553
*/
1554
if (to != NULL) {
1555
newstate = to;
1556
} else {
1557
newstate = xmlRegStatePush(ctxt);
1558
if (newstate == NULL)
1559
return(-1);
1560
}
1561
1562
/*
1563
* The principle here is to use counted transition
1564
* to avoid explosion in the number of states in the
1565
* graph. This is clearly more complex but should not
1566
* be exploitable at runtime.
1567
*/
1568
if ((atom->min == 0) && (atom->start0 == NULL)) {
1569
xmlRegAtomPtr copy;
1570
/*
1571
* duplicate a transition based on atom to count next
1572
* occurrences after 1. We cannot loop to atom->start
1573
* directly because we need an epsilon transition to
1574
* newstate.
1575
*/
1576
/* ???? For some reason it seems we never reach that
1577
case, I suppose this got optimized out before when
1578
building the automata */
1579
copy = xmlRegCopyAtom(ctxt, atom);
1580
if (copy == NULL)
1581
return(-1);
1582
copy->quant = XML_REGEXP_QUANT_ONCE;
1583
copy->min = 0;
1584
copy->max = 0;
1585
1586
if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1587
< 0) {
1588
xmlRegFreeAtom(copy);
1589
return(-1);
1590
}
1591
inter = ctxt->state;
1592
counter = xmlRegGetCounter(ctxt);
1593
if (counter < 0)
1594
return(-1);
1595
ctxt->counters[counter].min = atom->min - 1;
1596
ctxt->counters[counter].max = atom->max - 1;
1597
/* count the number of times we see it again */
1598
xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1599
atom->stop, counter);
1600
/* allow a way out based on the count */
1601
xmlFAGenerateCountedTransition(ctxt, inter,
1602
newstate, counter);
1603
/* and also allow a direct exit for 0 */
1604
xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1605
newstate);
1606
} else {
1607
/*
1608
* either we need the atom at least once or there
1609
* is an atom->start0 allowing to easily plug the
1610
* epsilon transition.
1611
*/
1612
counter = xmlRegGetCounter(ctxt);
1613
if (counter < 0)
1614
return(-1);
1615
ctxt->counters[counter].min = atom->min - 1;
1616
ctxt->counters[counter].max = atom->max - 1;
1617
/* allow a way out based on the count */
1618
xmlFAGenerateCountedTransition(ctxt, atom->stop,
1619
newstate, counter);
1620
/* count the number of times we see it again */
1621
xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1622
atom->start, counter);
1623
/* and if needed allow a direct exit for 0 */
1624
if (atom->min == 0)
1625
xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1626
newstate);
1627
1628
}
1629
atom->min = 0;
1630
atom->max = 0;
1631
atom->quant = XML_REGEXP_QUANT_ONCE;
1632
ctxt->state = newstate;
1633
}
1634
default:
1635
break;
1636
}
1637
if (xmlRegAtomPush(ctxt, atom) < 0)
1638
return(-1);
1639
return(0);
1640
}
1641
if ((atom->min == 0) && (atom->max == 0) &&
1642
(atom->quant == XML_REGEXP_QUANT_RANGE)) {
1643
/*
1644
* we can discard the atom and generate an epsilon transition instead
1645
*/
1646
if (to == NULL) {
1647
to = xmlRegStatePush(ctxt);
1648
if (to == NULL)
1649
return(-1);
1650
}
1651
xmlFAGenerateEpsilonTransition(ctxt, from, to);
1652
ctxt->state = to;
1653
xmlRegFreeAtom(atom);
1654
return(0);
1655
}
1656
if (to == NULL) {
1657
to = xmlRegStatePush(ctxt);
1658
if (to == NULL)
1659
return(-1);
1660
}
1661
end = to;
1662
if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1663
(atom->quant == XML_REGEXP_QUANT_PLUS)) {
1664
/*
1665
* Do not pollute the target state by adding transitions from
1666
* it as it is likely to be the shared target of multiple branches.
1667
* So isolate with an epsilon transition.
1668
*/
1669
xmlRegStatePtr tmp;
1670
1671
tmp = xmlRegStatePush(ctxt);
1672
if (tmp == NULL)
1673
return(-1);
1674
xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1675
to = tmp;
1676
}
1677
if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1678
(atom->min == 0) && (atom->max > 0)) {
1679
nullable = 1;
1680
atom->min = 1;
1681
if (atom->max == 1)
1682
atom->quant = XML_REGEXP_QUANT_OPT;
1683
}
1684
xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1685
ctxt->state = end;
1686
switch (atom->quant) {
1687
case XML_REGEXP_QUANT_OPT:
1688
atom->quant = XML_REGEXP_QUANT_ONCE;
1689
xmlFAGenerateEpsilonTransition(ctxt, from, to);
1690
break;
1691
case XML_REGEXP_QUANT_MULT:
1692
atom->quant = XML_REGEXP_QUANT_ONCE;
1693
xmlFAGenerateEpsilonTransition(ctxt, from, to);
1694
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1695
break;
1696
case XML_REGEXP_QUANT_PLUS:
1697
atom->quant = XML_REGEXP_QUANT_ONCE;
1698
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1699
break;
1700
case XML_REGEXP_QUANT_RANGE:
1701
if (nullable)
1702
xmlFAGenerateEpsilonTransition(ctxt, from, to);
1703
break;
1704
default:
1705
break;
1706
}
1707
if (xmlRegAtomPush(ctxt, atom) < 0)
1708
return(-1);
1709
return(0);
1710
}
1711
1712
/**
1713
* xmlFAReduceEpsilonTransitions:
1714
* @ctxt: a regexp parser context
1715
* @fromnr: the from state
1716
* @tonr: the to state
1717
* @counter: should that transition be associated to a counted
1718
*
1719
*/
1720
static void
1721
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1722
int tonr, int counter) {
1723
int transnr;
1724
xmlRegStatePtr from;
1725
xmlRegStatePtr to;
1726
1727
from = ctxt->states[fromnr];
1728
if (from == NULL)
1729
return;
1730
to = ctxt->states[tonr];
1731
if (to == NULL)
1732
return;
1733
if ((to->mark == XML_REGEXP_MARK_START) ||
1734
(to->mark == XML_REGEXP_MARK_VISITED))
1735
return;
1736
1737
to->mark = XML_REGEXP_MARK_VISITED;
1738
if (to->type == XML_REGEXP_FINAL_STATE) {
1739
from->type = XML_REGEXP_FINAL_STATE;
1740
}
1741
for (transnr = 0;transnr < to->nbTrans;transnr++) {
1742
xmlRegTransPtr t1 = &to->trans[transnr];
1743
int tcounter;
1744
1745
if (t1->to < 0)
1746
continue;
1747
if (t1->counter >= 0) {
1748
/* assert(counter < 0); */
1749
tcounter = t1->counter;
1750
} else {
1751
tcounter = counter;
1752
}
1753
if (t1->atom == NULL) {
1754
/*
1755
* Don't remove counted transitions
1756
* Don't loop either
1757
*/
1758
if (t1->to != fromnr) {
1759
if (t1->count >= 0) {
1760
xmlRegStateAddTrans(ctxt, from, NULL, ctxt->states[t1->to],
1761
-1, t1->count);
1762
} else {
1763
xmlFAReduceEpsilonTransitions(ctxt, fromnr, t1->to,
1764
tcounter);
1765
}
1766
}
1767
} else {
1768
xmlRegStateAddTrans(ctxt, from, t1->atom,
1769
ctxt->states[t1->to], tcounter, -1);
1770
}
1771
}
1772
}
1773
1774
/**
1775
* xmlFAFinishReduceEpsilonTransitions:
1776
* @ctxt: a regexp parser context
1777
* @fromnr: the from state
1778
* @tonr: the to state
1779
* @counter: should that transition be associated to a counted
1780
*
1781
*/
1782
static void
1783
xmlFAFinishReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int tonr) {
1784
int transnr;
1785
xmlRegStatePtr to;
1786
1787
to = ctxt->states[tonr];
1788
if (to == NULL)
1789
return;
1790
if ((to->mark == XML_REGEXP_MARK_START) ||
1791
(to->mark == XML_REGEXP_MARK_NORMAL))
1792
return;
1793
1794
to->mark = XML_REGEXP_MARK_NORMAL;
1795
for (transnr = 0;transnr < to->nbTrans;transnr++) {
1796
xmlRegTransPtr t1 = &to->trans[transnr];
1797
if ((t1->to >= 0) && (t1->atom == NULL))
1798
xmlFAFinishReduceEpsilonTransitions(ctxt, t1->to);
1799
}
1800
}
1801
1802
/**
1803
* xmlFAEliminateSimpleEpsilonTransitions:
1804
* @ctxt: a regexp parser context
1805
*
1806
* Eliminating general epsilon transitions can get costly in the general
1807
* algorithm due to the large amount of generated new transitions and
1808
* associated comparisons. However for simple epsilon transition used just
1809
* to separate building blocks when generating the automata this can be
1810
* reduced to state elimination:
1811
* - if there exists an epsilon from X to Y
1812
* - if there is no other transition from X
1813
* then X and Y are semantically equivalent and X can be eliminated
1814
* If X is the start state then make Y the start state, else replace the
1815
* target of all transitions to X by transitions to Y.
1816
*
1817
* If X is a final state, skip it.
1818
* Otherwise it would be necessary to manipulate counters for this case when
1819
* eliminating state 2:
1820
* State 1 has a transition with an atom to state 2.
1821
* State 2 is final and has an epsilon transition to state 1.
1822
*/
1823
static void
1824
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1825
int statenr, i, j, newto;
1826
xmlRegStatePtr state, tmp;
1827
1828
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1829
state = ctxt->states[statenr];
1830
if (state == NULL)
1831
continue;
1832
if (state->nbTrans != 1)
1833
continue;
1834
if (state->type == XML_REGEXP_UNREACH_STATE ||
1835
state->type == XML_REGEXP_FINAL_STATE)
1836
continue;
1837
/* is the only transition out a basic transition */
1838
if ((state->trans[0].atom == NULL) &&
1839
(state->trans[0].to >= 0) &&
1840
(state->trans[0].to != statenr) &&
1841
(state->trans[0].counter < 0) &&
1842
(state->trans[0].count < 0)) {
1843
newto = state->trans[0].to;
1844
1845
if (state->type == XML_REGEXP_START_STATE) {
1846
} else {
1847
for (i = 0;i < state->nbTransTo;i++) {
1848
tmp = ctxt->states[state->transTo[i]];
1849
for (j = 0;j < tmp->nbTrans;j++) {
1850
if (tmp->trans[j].to == statenr) {
1851
tmp->trans[j].to = -1;
1852
xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1853
ctxt->states[newto],
1854
tmp->trans[j].counter,
1855
tmp->trans[j].count);
1856
}
1857
}
1858
}
1859
if (state->type == XML_REGEXP_FINAL_STATE)
1860
ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1861
/* eliminate the transition completely */
1862
state->nbTrans = 0;
1863
1864
state->type = XML_REGEXP_UNREACH_STATE;
1865
1866
}
1867
1868
}
1869
}
1870
}
1871
/**
1872
* xmlFAEliminateEpsilonTransitions:
1873
* @ctxt: a regexp parser context
1874
*
1875
*/
1876
static void
1877
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1878
int statenr, transnr;
1879
xmlRegStatePtr state;
1880
int has_epsilon;
1881
1882
if (ctxt->states == NULL) return;
1883
1884
/*
1885
* Eliminate simple epsilon transition and the associated unreachable
1886
* states.
1887
*/
1888
xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1889
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1890
state = ctxt->states[statenr];
1891
if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1892
xmlRegFreeState(state);
1893
ctxt->states[statenr] = NULL;
1894
}
1895
}
1896
1897
has_epsilon = 0;
1898
1899
/*
1900
* Build the completed transitions bypassing the epsilons
1901
* Use a marking algorithm to avoid loops
1902
* Mark sink states too.
1903
* Process from the latest states backward to the start when
1904
* there is long cascading epsilon chains this minimize the
1905
* recursions and transition compares when adding the new ones
1906
*/
1907
for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1908
state = ctxt->states[statenr];
1909
if (state == NULL)
1910
continue;
1911
if ((state->nbTrans == 0) &&
1912
(state->type != XML_REGEXP_FINAL_STATE)) {
1913
state->type = XML_REGEXP_SINK_STATE;
1914
}
1915
for (transnr = 0;transnr < state->nbTrans;transnr++) {
1916
if ((state->trans[transnr].atom == NULL) &&
1917
(state->trans[transnr].to >= 0)) {
1918
if (state->trans[transnr].to == statenr) {
1919
state->trans[transnr].to = -1;
1920
} else if (state->trans[transnr].count < 0) {
1921
int newto = state->trans[transnr].to;
1922
1923
has_epsilon = 1;
1924
state->trans[transnr].to = -2;
1925
state->mark = XML_REGEXP_MARK_START;
1926
xmlFAReduceEpsilonTransitions(ctxt, statenr,
1927
newto, state->trans[transnr].counter);
1928
xmlFAFinishReduceEpsilonTransitions(ctxt, newto);
1929
state->mark = XML_REGEXP_MARK_NORMAL;
1930
}
1931
}
1932
}
1933
}
1934
/*
1935
* Eliminate the epsilon transitions
1936
*/
1937
if (has_epsilon) {
1938
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1939
state = ctxt->states[statenr];
1940
if (state == NULL)
1941
continue;
1942
for (transnr = 0;transnr < state->nbTrans;transnr++) {
1943
xmlRegTransPtr trans = &(state->trans[transnr]);
1944
if ((trans->atom == NULL) &&
1945
(trans->count < 0) &&
1946
(trans->to >= 0)) {
1947
trans->to = -1;
1948
}
1949
}
1950
}
1951
}
1952
1953
/*
1954
* Use this pass to detect unreachable states too
1955
*/
1956
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1957
state = ctxt->states[statenr];
1958
if (state != NULL)
1959
state->reached = XML_REGEXP_MARK_NORMAL;
1960
}
1961
state = ctxt->states[0];
1962
if (state != NULL)
1963
state->reached = XML_REGEXP_MARK_START;
1964
while (state != NULL) {
1965
xmlRegStatePtr target = NULL;
1966
state->reached = XML_REGEXP_MARK_VISITED;
1967
/*
1968
* Mark all states reachable from the current reachable state
1969
*/
1970
for (transnr = 0;transnr < state->nbTrans;transnr++) {
1971
if ((state->trans[transnr].to >= 0) &&
1972
((state->trans[transnr].atom != NULL) ||
1973
(state->trans[transnr].count >= 0))) {
1974
int newto = state->trans[transnr].to;
1975
1976
if (ctxt->states[newto] == NULL)
1977
continue;
1978
if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1979
ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
1980
target = ctxt->states[newto];
1981
}
1982
}
1983
}
1984
1985
/*
1986
* find the next accessible state not explored
1987
*/
1988
if (target == NULL) {
1989
for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1990
state = ctxt->states[statenr];
1991
if ((state != NULL) && (state->reached ==
1992
XML_REGEXP_MARK_START)) {
1993
target = state;
1994
break;
1995
}
1996
}
1997
}
1998
state = target;
1999
}
2000
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2001
state = ctxt->states[statenr];
2002
if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2003
xmlRegFreeState(state);
2004
ctxt->states[statenr] = NULL;
2005
}
2006
}
2007
2008
}
2009
2010
static int
2011
xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2012
int ret = 0;
2013
2014
if ((range1->type == XML_REGEXP_RANGES) ||
2015
(range2->type == XML_REGEXP_RANGES) ||
2016
(range2->type == XML_REGEXP_SUBREG) ||
2017
(range1->type == XML_REGEXP_SUBREG) ||
2018
(range1->type == XML_REGEXP_STRING) ||
2019
(range2->type == XML_REGEXP_STRING))
2020
return(-1);
2021
2022
/* put them in order */
2023
if (range1->type > range2->type) {
2024
xmlRegRangePtr tmp;
2025
2026
tmp = range1;
2027
range1 = range2;
2028
range2 = tmp;
2029
}
2030
if ((range1->type == XML_REGEXP_ANYCHAR) ||
2031
(range2->type == XML_REGEXP_ANYCHAR)) {
2032
ret = 1;
2033
} else if ((range1->type == XML_REGEXP_EPSILON) ||
2034
(range2->type == XML_REGEXP_EPSILON)) {
2035
return(0);
2036
} else if (range1->type == range2->type) {
2037
if (range1->type != XML_REGEXP_CHARVAL)
2038
ret = 1;
2039
else if ((range1->end < range2->start) ||
2040
(range2->end < range1->start))
2041
ret = 0;
2042
else
2043
ret = 1;
2044
} else if (range1->type == XML_REGEXP_CHARVAL) {
2045
int codepoint;
2046
int neg = 0;
2047
2048
/*
2049
* just check all codepoints in the range for acceptance,
2050
* this is usually way cheaper since done only once at
2051
* compilation than testing over and over at runtime or
2052
* pushing too many states when evaluating.
2053
*/
2054
if (((range1->neg == 0) && (range2->neg != 0)) ||
2055
((range1->neg != 0) && (range2->neg == 0)))
2056
neg = 1;
2057
2058
for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2059
ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2060
0, range2->start, range2->end,
2061
range2->blockName);
2062
if (ret < 0)
2063
return(-1);
2064
if (((neg == 1) && (ret == 0)) ||
2065
((neg == 0) && (ret == 1)))
2066
return(1);
2067
}
2068
return(0);
2069
} else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2070
(range2->type == XML_REGEXP_BLOCK_NAME)) {
2071
if (range1->type == range2->type) {
2072
ret = xmlStrEqual(range1->blockName, range2->blockName);
2073
} else {
2074
/*
2075
* comparing a block range with anything else is way
2076
* too costly, and maintaining the table is like too much
2077
* memory too, so let's force the automata to save state
2078
* here.
2079
*/
2080
return(1);
2081
}
2082
} else if ((range1->type < XML_REGEXP_LETTER) ||
2083
(range2->type < XML_REGEXP_LETTER)) {
2084
if ((range1->type == XML_REGEXP_ANYSPACE) &&
2085
(range2->type == XML_REGEXP_NOTSPACE))
2086
ret = 0;
2087
else if ((range1->type == XML_REGEXP_INITNAME) &&
2088
(range2->type == XML_REGEXP_NOTINITNAME))
2089
ret = 0;
2090
else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2091
(range2->type == XML_REGEXP_NOTNAMECHAR))
2092
ret = 0;
2093
else if ((range1->type == XML_REGEXP_DECIMAL) &&
2094
(range2->type == XML_REGEXP_NOTDECIMAL))
2095
ret = 0;
2096
else if ((range1->type == XML_REGEXP_REALCHAR) &&
2097
(range2->type == XML_REGEXP_NOTREALCHAR))
2098
ret = 0;
2099
else {
2100
/* same thing to limit complexity */
2101
return(1);
2102
}
2103
} else {
2104
ret = 0;
2105
/* range1->type < range2->type here */
2106
switch (range1->type) {
2107
case XML_REGEXP_LETTER:
2108
/* all disjoint except in the subgroups */
2109
if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2110
(range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2111
(range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2112
(range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2113
(range2->type == XML_REGEXP_LETTER_OTHERS))
2114
ret = 1;
2115
break;
2116
case XML_REGEXP_MARK:
2117
if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2118
(range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2119
(range2->type == XML_REGEXP_MARK_ENCLOSING))
2120
ret = 1;
2121
break;
2122
case XML_REGEXP_NUMBER:
2123
if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2124
(range2->type == XML_REGEXP_NUMBER_LETTER) ||
2125
(range2->type == XML_REGEXP_NUMBER_OTHERS))
2126
ret = 1;
2127
break;
2128
case XML_REGEXP_PUNCT:
2129
if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2130
(range2->type == XML_REGEXP_PUNCT_DASH) ||
2131
(range2->type == XML_REGEXP_PUNCT_OPEN) ||
2132
(range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2133
(range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2134
(range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2135
(range2->type == XML_REGEXP_PUNCT_OTHERS))
2136
ret = 1;
2137
break;
2138
case XML_REGEXP_SEPAR:
2139
if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2140
(range2->type == XML_REGEXP_SEPAR_LINE) ||
2141
(range2->type == XML_REGEXP_SEPAR_PARA))
2142
ret = 1;
2143
break;
2144
case XML_REGEXP_SYMBOL:
2145
if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2146
(range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2147
(range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2148
(range2->type == XML_REGEXP_SYMBOL_OTHERS))
2149
ret = 1;
2150
break;
2151
case XML_REGEXP_OTHER:
2152
if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2153
(range2->type == XML_REGEXP_OTHER_FORMAT) ||
2154
(range2->type == XML_REGEXP_OTHER_PRIVATE))
2155
ret = 1;
2156
break;
2157
default:
2158
if ((range2->type >= XML_REGEXP_LETTER) &&
2159
(range2->type < XML_REGEXP_BLOCK_NAME))
2160
ret = 0;
2161
else {
2162
/* safety net ! */
2163
return(1);
2164
}
2165
}
2166
}
2167
if (((range1->neg == 0) && (range2->neg != 0)) ||
2168
((range1->neg != 0) && (range2->neg == 0)))
2169
ret = !ret;
2170
return(ret);
2171
}
2172
2173
/**
2174
* xmlFACompareAtomTypes:
2175
* @type1: an atom type
2176
* @type2: an atom type
2177
*
2178
* Compares two atoms type to check whether they intersect in some ways,
2179
* this is used by xmlFACompareAtoms only
2180
*
2181
* Returns 1 if they may intersect and 0 otherwise
2182
*/
2183
static int
2184
xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2185
if ((type1 == XML_REGEXP_EPSILON) ||
2186
(type1 == XML_REGEXP_CHARVAL) ||
2187
(type1 == XML_REGEXP_RANGES) ||
2188
(type1 == XML_REGEXP_SUBREG) ||
2189
(type1 == XML_REGEXP_STRING) ||
2190
(type1 == XML_REGEXP_ANYCHAR))
2191
return(1);
2192
if ((type2 == XML_REGEXP_EPSILON) ||
2193
(type2 == XML_REGEXP_CHARVAL) ||
2194
(type2 == XML_REGEXP_RANGES) ||
2195
(type2 == XML_REGEXP_SUBREG) ||
2196
(type2 == XML_REGEXP_STRING) ||
2197
(type2 == XML_REGEXP_ANYCHAR))
2198
return(1);
2199
2200
if (type1 == type2) return(1);
2201
2202
/* simplify subsequent compares by making sure type1 < type2 */
2203
if (type1 > type2) {
2204
xmlRegAtomType tmp = type1;
2205
type1 = type2;
2206
type2 = tmp;
2207
}
2208
switch (type1) {
2209
case XML_REGEXP_ANYSPACE: /* \s */
2210
/* can't be a letter, number, mark, punctuation, symbol */
2211
if ((type2 == XML_REGEXP_NOTSPACE) ||
2212
((type2 >= XML_REGEXP_LETTER) &&
2213
(type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2214
((type2 >= XML_REGEXP_NUMBER) &&
2215
(type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2216
((type2 >= XML_REGEXP_MARK) &&
2217
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2218
((type2 >= XML_REGEXP_PUNCT) &&
2219
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2220
((type2 >= XML_REGEXP_SYMBOL) &&
2221
(type2 <= XML_REGEXP_SYMBOL_OTHERS))
2222
) return(0);
2223
break;
2224
case XML_REGEXP_NOTSPACE: /* \S */
2225
break;
2226
case XML_REGEXP_INITNAME: /* \l */
2227
/* can't be a number, mark, separator, punctuation, symbol or other */
2228
if ((type2 == XML_REGEXP_NOTINITNAME) ||
2229
((type2 >= XML_REGEXP_NUMBER) &&
2230
(type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2231
((type2 >= XML_REGEXP_MARK) &&
2232
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2233
((type2 >= XML_REGEXP_SEPAR) &&
2234
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
2235
((type2 >= XML_REGEXP_PUNCT) &&
2236
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2237
((type2 >= XML_REGEXP_SYMBOL) &&
2238
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2239
((type2 >= XML_REGEXP_OTHER) &&
2240
(type2 <= XML_REGEXP_OTHER_NA))
2241
) return(0);
2242
break;
2243
case XML_REGEXP_NOTINITNAME: /* \L */
2244
break;
2245
case XML_REGEXP_NAMECHAR: /* \c */
2246
/* can't be a mark, separator, punctuation, symbol or other */
2247
if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2248
((type2 >= XML_REGEXP_MARK) &&
2249
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2250
((type2 >= XML_REGEXP_PUNCT) &&
2251
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2252
((type2 >= XML_REGEXP_SEPAR) &&
2253
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
2254
((type2 >= XML_REGEXP_SYMBOL) &&
2255
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2256
((type2 >= XML_REGEXP_OTHER) &&
2257
(type2 <= XML_REGEXP_OTHER_NA))
2258
) return(0);
2259
break;
2260
case XML_REGEXP_NOTNAMECHAR: /* \C */
2261
break;
2262
case XML_REGEXP_DECIMAL: /* \d */
2263
/* can't be a letter, mark, separator, punctuation, symbol or other */
2264
if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2265
(type2 == XML_REGEXP_REALCHAR) ||
2266
((type2 >= XML_REGEXP_LETTER) &&
2267
(type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2268
((type2 >= XML_REGEXP_MARK) &&
2269
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2270
((type2 >= XML_REGEXP_PUNCT) &&
2271
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2272
((type2 >= XML_REGEXP_SEPAR) &&
2273
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
2274
((type2 >= XML_REGEXP_SYMBOL) &&
2275
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2276
((type2 >= XML_REGEXP_OTHER) &&
2277
(type2 <= XML_REGEXP_OTHER_NA))
2278
)return(0);
2279
break;
2280
case XML_REGEXP_NOTDECIMAL: /* \D */
2281
break;
2282
case XML_REGEXP_REALCHAR: /* \w */
2283
/* can't be a mark, separator, punctuation, symbol or other */
2284
if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2285
((type2 >= XML_REGEXP_MARK) &&
2286
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2287
((type2 >= XML_REGEXP_PUNCT) &&
2288
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2289
((type2 >= XML_REGEXP_SEPAR) &&
2290
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
2291
((type2 >= XML_REGEXP_SYMBOL) &&
2292
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2293
((type2 >= XML_REGEXP_OTHER) &&
2294
(type2 <= XML_REGEXP_OTHER_NA))
2295
)return(0);
2296
break;
2297
case XML_REGEXP_NOTREALCHAR: /* \W */
2298
break;
2299
/*
2300
* at that point we know both type 1 and type2 are from
2301
* character categories are ordered and are different,
2302
* it becomes simple because this is a partition
2303
*/
2304
case XML_REGEXP_LETTER:
2305
if (type2 <= XML_REGEXP_LETTER_OTHERS)
2306
return(1);
2307
return(0);
2308
case XML_REGEXP_LETTER_UPPERCASE:
2309
case XML_REGEXP_LETTER_LOWERCASE:
2310
case XML_REGEXP_LETTER_TITLECASE:
2311
case XML_REGEXP_LETTER_MODIFIER:
2312
case XML_REGEXP_LETTER_OTHERS:
2313
return(0);
2314
case XML_REGEXP_MARK:
2315
if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2316
return(1);
2317
return(0);
2318
case XML_REGEXP_MARK_NONSPACING:
2319
case XML_REGEXP_MARK_SPACECOMBINING:
2320
case XML_REGEXP_MARK_ENCLOSING:
2321
return(0);
2322
case XML_REGEXP_NUMBER:
2323
if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2324
return(1);
2325
return(0);
2326
case XML_REGEXP_NUMBER_DECIMAL:
2327
case XML_REGEXP_NUMBER_LETTER:
2328
case XML_REGEXP_NUMBER_OTHERS:
2329
return(0);
2330
case XML_REGEXP_PUNCT:
2331
if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2332
return(1);
2333
return(0);
2334
case XML_REGEXP_PUNCT_CONNECTOR:
2335
case XML_REGEXP_PUNCT_DASH:
2336
case XML_REGEXP_PUNCT_OPEN:
2337
case XML_REGEXP_PUNCT_CLOSE:
2338
case XML_REGEXP_PUNCT_INITQUOTE:
2339
case XML_REGEXP_PUNCT_FINQUOTE:
2340
case XML_REGEXP_PUNCT_OTHERS:
2341
return(0);
2342
case XML_REGEXP_SEPAR:
2343
if (type2 <= XML_REGEXP_SEPAR_PARA)
2344
return(1);
2345
return(0);
2346
case XML_REGEXP_SEPAR_SPACE:
2347
case XML_REGEXP_SEPAR_LINE:
2348
case XML_REGEXP_SEPAR_PARA:
2349
return(0);
2350
case XML_REGEXP_SYMBOL:
2351
if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2352
return(1);
2353
return(0);
2354
case XML_REGEXP_SYMBOL_MATH:
2355
case XML_REGEXP_SYMBOL_CURRENCY:
2356
case XML_REGEXP_SYMBOL_MODIFIER:
2357
case XML_REGEXP_SYMBOL_OTHERS:
2358
return(0);
2359
case XML_REGEXP_OTHER:
2360
if (type2 <= XML_REGEXP_OTHER_NA)
2361
return(1);
2362
return(0);
2363
case XML_REGEXP_OTHER_CONTROL:
2364
case XML_REGEXP_OTHER_FORMAT:
2365
case XML_REGEXP_OTHER_PRIVATE:
2366
case XML_REGEXP_OTHER_NA:
2367
return(0);
2368
default:
2369
break;
2370
}
2371
return(1);
2372
}
2373
2374
/**
2375
* xmlFAEqualAtoms:
2376
* @atom1: an atom
2377
* @atom2: an atom
2378
* @deep: if not set only compare string pointers
2379
*
2380
* Compares two atoms to check whether they are the same exactly
2381
* this is used to remove equivalent transitions
2382
*
2383
* Returns 1 if same and 0 otherwise
2384
*/
2385
static int
2386
xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2387
int ret = 0;
2388
2389
if (atom1 == atom2)
2390
return(1);
2391
if ((atom1 == NULL) || (atom2 == NULL))
2392
return(0);
2393
2394
if (atom1->type != atom2->type)
2395
return(0);
2396
switch (atom1->type) {
2397
case XML_REGEXP_EPSILON:
2398
ret = 0;
2399
break;
2400
case XML_REGEXP_STRING:
2401
if (!deep)
2402
ret = (atom1->valuep == atom2->valuep);
2403
else
2404
ret = xmlStrEqual((xmlChar *)atom1->valuep,
2405
(xmlChar *)atom2->valuep);
2406
break;
2407
case XML_REGEXP_CHARVAL:
2408
ret = (atom1->codepoint == atom2->codepoint);
2409
break;
2410
case XML_REGEXP_RANGES:
2411
/* too hard to do in the general case */
2412
ret = 0;
2413
default:
2414
break;
2415
}
2416
return(ret);
2417
}
2418
2419
/**
2420
* xmlFACompareAtoms:
2421
* @atom1: an atom
2422
* @atom2: an atom
2423
* @deep: if not set only compare string pointers
2424
*
2425
* Compares two atoms to check whether they intersect in some ways,
2426
* this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2427
*
2428
* Returns 1 if yes and 0 otherwise
2429
*/
2430
static int
2431
xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2432
int ret = 1;
2433
2434
if (atom1 == atom2)
2435
return(1);
2436
if ((atom1 == NULL) || (atom2 == NULL))
2437
return(0);
2438
2439
if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2440
(atom2->type == XML_REGEXP_ANYCHAR))
2441
return(1);
2442
2443
if (atom1->type > atom2->type) {
2444
xmlRegAtomPtr tmp;
2445
tmp = atom1;
2446
atom1 = atom2;
2447
atom2 = tmp;
2448
}
2449
if (atom1->type != atom2->type) {
2450
ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2451
/* if they can't intersect at the type level break now */
2452
if (ret == 0)
2453
return(0);
2454
}
2455
switch (atom1->type) {
2456
case XML_REGEXP_STRING:
2457
if (!deep)
2458
ret = (atom1->valuep != atom2->valuep);
2459
else {
2460
xmlChar *val1 = (xmlChar *)atom1->valuep;
2461
xmlChar *val2 = (xmlChar *)atom2->valuep;
2462
int compound1 = (xmlStrchr(val1, '|') != NULL);
2463
int compound2 = (xmlStrchr(val2, '|') != NULL);
2464
2465
/* Ignore negative match flag for ##other namespaces */
2466
if (compound1 != compound2)
2467
return(0);
2468
2469
ret = xmlRegStrEqualWildcard(val1, val2);
2470
}
2471
break;
2472
case XML_REGEXP_EPSILON:
2473
goto not_determinist;
2474
case XML_REGEXP_CHARVAL:
2475
if (atom2->type == XML_REGEXP_CHARVAL) {
2476
ret = (atom1->codepoint == atom2->codepoint);
2477
} else {
2478
ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2479
if (ret < 0)
2480
ret = 1;
2481
}
2482
break;
2483
case XML_REGEXP_RANGES:
2484
if (atom2->type == XML_REGEXP_RANGES) {
2485
int i, j, res;
2486
xmlRegRangePtr r1, r2;
2487
2488
/*
2489
* need to check that none of the ranges eventually matches
2490
*/
2491
for (i = 0;i < atom1->nbRanges;i++) {
2492
for (j = 0;j < atom2->nbRanges;j++) {
2493
r1 = atom1->ranges[i];
2494
r2 = atom2->ranges[j];
2495
res = xmlFACompareRanges(r1, r2);
2496
if (res == 1) {
2497
ret = 1;
2498
goto done;
2499
}
2500
}
2501
}
2502
ret = 0;
2503
}
2504
break;
2505
default:
2506
goto not_determinist;
2507
}
2508
done:
2509
if (atom1->neg != atom2->neg) {
2510
ret = !ret;
2511
}
2512
if (ret == 0)
2513
return(0);
2514
not_determinist:
2515
return(1);
2516
}
2517
2518
/**
2519
* xmlFARecurseDeterminism:
2520
* @ctxt: a regexp parser context
2521
*
2522
* Check whether the associated regexp is determinist,
2523
* should be called after xmlFAEliminateEpsilonTransitions()
2524
*
2525
*/
2526
static int
2527
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2528
int fromnr, int tonr, xmlRegAtomPtr atom) {
2529
int ret = 1;
2530
int res;
2531
int transnr, nbTrans;
2532
xmlRegTransPtr t1;
2533
int deep = 1;
2534
2535
if (state == NULL)
2536
return(ret);
2537
if (state->markd == XML_REGEXP_MARK_VISITED)
2538
return(ret);
2539
2540
if (ctxt->flags & AM_AUTOMATA_RNG)
2541
deep = 0;
2542
2543
/*
2544
* don't recurse on transitions potentially added in the course of
2545
* the elimination.
2546
*/
2547
nbTrans = state->nbTrans;
2548
for (transnr = 0;transnr < nbTrans;transnr++) {
2549
t1 = &(state->trans[transnr]);
2550
/*
2551
* check transitions conflicting with the one looked at
2552
*/
2553
if ((t1->to < 0) || (t1->to == fromnr))
2554
continue;
2555
if (t1->atom == NULL) {
2556
state->markd = XML_REGEXP_MARK_VISITED;
2557
res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2558
fromnr, tonr, atom);
2559
if (res == 0) {
2560
ret = 0;
2561
/* t1->nd = 1; */
2562
}
2563
continue;
2564
}
2565
if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2566
/* Treat equal transitions as deterministic. */
2567
if ((t1->to != tonr) ||
2568
(!xmlFAEqualAtoms(t1->atom, atom, deep)))
2569
ret = 0;
2570
/* mark the transition as non-deterministic */
2571
t1->nd = 1;
2572
}
2573
}
2574
return(ret);
2575
}
2576
2577
/**
2578
* xmlFAFinishRecurseDeterminism:
2579
* @ctxt: a regexp parser context
2580
*
2581
* Reset flags after checking determinism.
2582
*/
2583
static void
2584
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2585
int transnr, nbTrans;
2586
2587
if (state == NULL)
2588
return;
2589
if (state->markd != XML_REGEXP_MARK_VISITED)
2590
return;
2591
state->markd = 0;
2592
2593
nbTrans = state->nbTrans;
2594
for (transnr = 0; transnr < nbTrans; transnr++) {
2595
xmlRegTransPtr t1 = &state->trans[transnr];
2596
if ((t1->atom == NULL) && (t1->to >= 0))
2597
xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2598
}
2599
}
2600
2601
/**
2602
* xmlFAComputesDeterminism:
2603
* @ctxt: a regexp parser context
2604
*
2605
* Check whether the associated regexp is determinist,
2606
* should be called after xmlFAEliminateEpsilonTransitions()
2607
*
2608
*/
2609
static int
2610
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2611
int statenr, transnr;
2612
xmlRegStatePtr state;
2613
xmlRegTransPtr t1, t2, last;
2614
int i;
2615
int ret = 1;
2616
int deep = 1;
2617
2618
if (ctxt->determinist != -1)
2619
return(ctxt->determinist);
2620
2621
if (ctxt->flags & AM_AUTOMATA_RNG)
2622
deep = 0;
2623
2624
/*
2625
* First cleanup the automata removing cancelled transitions
2626
*/
2627
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2628
state = ctxt->states[statenr];
2629
if (state == NULL)
2630
continue;
2631
if (state->nbTrans < 2)
2632
continue;
2633
for (transnr = 0;transnr < state->nbTrans;transnr++) {
2634
t1 = &(state->trans[transnr]);
2635
/*
2636
* Determinism checks in case of counted or all transitions
2637
* will have to be handled separately
2638
*/
2639
if (t1->atom == NULL) {
2640
/* t1->nd = 1; */
2641
continue;
2642
}
2643
if (t1->to < 0) /* eliminated */
2644
continue;
2645
for (i = 0;i < transnr;i++) {
2646
t2 = &(state->trans[i]);
2647
if (t2->to < 0) /* eliminated */
2648
continue;
2649
if (t2->atom != NULL) {
2650
if (t1->to == t2->to) {
2651
/*
2652
* Here we use deep because we want to keep the
2653
* transitions which indicate a conflict
2654
*/
2655
if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2656
(t1->counter == t2->counter) &&
2657
(t1->count == t2->count))
2658
t2->to = -1; /* eliminated */
2659
}
2660
}
2661
}
2662
}
2663
}
2664
2665
/*
2666
* Check for all states that there aren't 2 transitions
2667
* with the same atom and a different target.
2668
*/
2669
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2670
state = ctxt->states[statenr];
2671
if (state == NULL)
2672
continue;
2673
if (state->nbTrans < 2)
2674
continue;
2675
last = NULL;
2676
for (transnr = 0;transnr < state->nbTrans;transnr++) {
2677
t1 = &(state->trans[transnr]);
2678
/*
2679
* Determinism checks in case of counted or all transitions
2680
* will have to be handled separately
2681
*/
2682
if (t1->atom == NULL) {
2683
continue;
2684
}
2685
if (t1->to < 0) /* eliminated */
2686
continue;
2687
for (i = 0;i < transnr;i++) {
2688
t2 = &(state->trans[i]);
2689
if (t2->to < 0) /* eliminated */
2690
continue;
2691
if (t2->atom != NULL) {
2692
/*
2693
* But here we don't use deep because we want to
2694
* find transitions which indicate a conflict
2695
*/
2696
if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2697
/*
2698
* Treat equal counter transitions that couldn't be
2699
* eliminated as deterministic.
2700
*/
2701
if ((t1->to != t2->to) ||
2702
(t1->counter == t2->counter) ||
2703
(!xmlFAEqualAtoms(t1->atom, t2->atom, deep)))
2704
ret = 0;
2705
/* mark the transitions as non-deterministic ones */
2706
t1->nd = 1;
2707
t2->nd = 1;
2708
last = t1;
2709
}
2710
} else {
2711
int res;
2712
2713
/*
2714
* do the closure in case of remaining specific
2715
* epsilon transitions like choices or all
2716
*/
2717
res = xmlFARecurseDeterminism(ctxt, ctxt->states[t2->to],
2718
statenr, t1->to, t1->atom);
2719
xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t2->to]);
2720
/* don't shortcut the computation so all non deterministic
2721
transition get marked down
2722
if (ret == 0)
2723
return(0);
2724
*/
2725
if (res == 0) {
2726
t1->nd = 1;
2727
/* t2->nd = 1; */
2728
last = t1;
2729
ret = 0;
2730
}
2731
}
2732
}
2733
/* don't shortcut the computation so all non deterministic
2734
transition get marked down
2735
if (ret == 0)
2736
break; */
2737
}
2738
2739
/*
2740
* mark specifically the last non-deterministic transition
2741
* from a state since there is no need to set-up rollback
2742
* from it
2743
*/
2744
if (last != NULL) {
2745
last->nd = 2;
2746
}
2747
2748
/* don't shortcut the computation so all non deterministic
2749
transition get marked down
2750
if (ret == 0)
2751
break; */
2752
}
2753
2754
ctxt->determinist = ret;
2755
return(ret);
2756
}
2757
2758
/************************************************************************
2759
* *
2760
* Routines to check input against transition atoms *
2761
* *
2762
************************************************************************/
2763
2764
static int
2765
xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2766
int start, int end, const xmlChar *blockName) {
2767
int ret = 0;
2768
2769
switch (type) {
2770
case XML_REGEXP_STRING:
2771
case XML_REGEXP_SUBREG:
2772
case XML_REGEXP_RANGES:
2773
case XML_REGEXP_EPSILON:
2774
return(-1);
2775
case XML_REGEXP_ANYCHAR:
2776
ret = ((codepoint != '\n') && (codepoint != '\r'));
2777
break;
2778
case XML_REGEXP_CHARVAL:
2779
ret = ((codepoint >= start) && (codepoint <= end));
2780
break;
2781
case XML_REGEXP_NOTSPACE:
2782
neg = !neg;
2783
/* Falls through. */
2784
case XML_REGEXP_ANYSPACE:
2785
ret = ((codepoint == '\n') || (codepoint == '\r') ||
2786
(codepoint == '\t') || (codepoint == ' '));
2787
break;
2788
case XML_REGEXP_NOTINITNAME:
2789
neg = !neg;
2790
/* Falls through. */
2791
case XML_REGEXP_INITNAME:
2792
ret = (IS_LETTER(codepoint) ||
2793
(codepoint == '_') || (codepoint == ':'));
2794
break;
2795
case XML_REGEXP_NOTNAMECHAR:
2796
neg = !neg;
2797
/* Falls through. */
2798
case XML_REGEXP_NAMECHAR:
2799
ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2800
(codepoint == '.') || (codepoint == '-') ||
2801
(codepoint == '_') || (codepoint == ':') ||
2802
IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2803
break;
2804
case XML_REGEXP_NOTDECIMAL:
2805
neg = !neg;
2806
/* Falls through. */
2807
case XML_REGEXP_DECIMAL:
2808
ret = xmlUCSIsCatNd(codepoint);
2809
break;
2810
case XML_REGEXP_REALCHAR:
2811
neg = !neg;
2812
/* Falls through. */
2813
case XML_REGEXP_NOTREALCHAR:
2814
ret = xmlUCSIsCatP(codepoint);
2815
if (ret == 0)
2816
ret = xmlUCSIsCatZ(codepoint);
2817
if (ret == 0)
2818
ret = xmlUCSIsCatC(codepoint);
2819
break;
2820
case XML_REGEXP_LETTER:
2821
ret = xmlUCSIsCatL(codepoint);
2822
break;
2823
case XML_REGEXP_LETTER_UPPERCASE:
2824
ret = xmlUCSIsCatLu(codepoint);
2825
break;
2826
case XML_REGEXP_LETTER_LOWERCASE:
2827
ret = xmlUCSIsCatLl(codepoint);
2828
break;
2829
case XML_REGEXP_LETTER_TITLECASE:
2830
ret = xmlUCSIsCatLt(codepoint);
2831
break;
2832
case XML_REGEXP_LETTER_MODIFIER:
2833
ret = xmlUCSIsCatLm(codepoint);
2834
break;
2835
case XML_REGEXP_LETTER_OTHERS:
2836
ret = xmlUCSIsCatLo(codepoint);
2837
break;
2838
case XML_REGEXP_MARK:
2839
ret = xmlUCSIsCatM(codepoint);
2840
break;
2841
case XML_REGEXP_MARK_NONSPACING:
2842
ret = xmlUCSIsCatMn(codepoint);
2843
break;
2844
case XML_REGEXP_MARK_SPACECOMBINING:
2845
ret = xmlUCSIsCatMc(codepoint);
2846
break;
2847
case XML_REGEXP_MARK_ENCLOSING:
2848
ret = xmlUCSIsCatMe(codepoint);
2849
break;
2850
case XML_REGEXP_NUMBER:
2851
ret = xmlUCSIsCatN(codepoint);
2852
break;
2853
case XML_REGEXP_NUMBER_DECIMAL:
2854
ret = xmlUCSIsCatNd(codepoint);
2855
break;
2856
case XML_REGEXP_NUMBER_LETTER:
2857
ret = xmlUCSIsCatNl(codepoint);
2858
break;
2859
case XML_REGEXP_NUMBER_OTHERS:
2860
ret = xmlUCSIsCatNo(codepoint);
2861
break;
2862
case XML_REGEXP_PUNCT:
2863
ret = xmlUCSIsCatP(codepoint);
2864
break;
2865
case XML_REGEXP_PUNCT_CONNECTOR:
2866
ret = xmlUCSIsCatPc(codepoint);
2867
break;
2868
case XML_REGEXP_PUNCT_DASH:
2869
ret = xmlUCSIsCatPd(codepoint);
2870
break;
2871
case XML_REGEXP_PUNCT_OPEN:
2872
ret = xmlUCSIsCatPs(codepoint);
2873
break;
2874
case XML_REGEXP_PUNCT_CLOSE:
2875
ret = xmlUCSIsCatPe(codepoint);
2876
break;
2877
case XML_REGEXP_PUNCT_INITQUOTE:
2878
ret = xmlUCSIsCatPi(codepoint);
2879
break;
2880
case XML_REGEXP_PUNCT_FINQUOTE:
2881
ret = xmlUCSIsCatPf(codepoint);
2882
break;
2883
case XML_REGEXP_PUNCT_OTHERS:
2884
ret = xmlUCSIsCatPo(codepoint);
2885
break;
2886
case XML_REGEXP_SEPAR:
2887
ret = xmlUCSIsCatZ(codepoint);
2888
break;
2889
case XML_REGEXP_SEPAR_SPACE:
2890
ret = xmlUCSIsCatZs(codepoint);
2891
break;
2892
case XML_REGEXP_SEPAR_LINE:
2893
ret = xmlUCSIsCatZl(codepoint);
2894
break;
2895
case XML_REGEXP_SEPAR_PARA:
2896
ret = xmlUCSIsCatZp(codepoint);
2897
break;
2898
case XML_REGEXP_SYMBOL:
2899
ret = xmlUCSIsCatS(codepoint);
2900
break;
2901
case XML_REGEXP_SYMBOL_MATH:
2902
ret = xmlUCSIsCatSm(codepoint);
2903
break;
2904
case XML_REGEXP_SYMBOL_CURRENCY:
2905
ret = xmlUCSIsCatSc(codepoint);
2906
break;
2907
case XML_REGEXP_SYMBOL_MODIFIER:
2908
ret = xmlUCSIsCatSk(codepoint);
2909
break;
2910
case XML_REGEXP_SYMBOL_OTHERS:
2911
ret = xmlUCSIsCatSo(codepoint);
2912
break;
2913
case XML_REGEXP_OTHER:
2914
ret = xmlUCSIsCatC(codepoint);
2915
break;
2916
case XML_REGEXP_OTHER_CONTROL:
2917
ret = xmlUCSIsCatCc(codepoint);
2918
break;
2919
case XML_REGEXP_OTHER_FORMAT:
2920
ret = xmlUCSIsCatCf(codepoint);
2921
break;
2922
case XML_REGEXP_OTHER_PRIVATE:
2923
ret = xmlUCSIsCatCo(codepoint);
2924
break;
2925
case XML_REGEXP_OTHER_NA:
2926
/* ret = xmlUCSIsCatCn(codepoint); */
2927
/* Seems it doesn't exist anymore in recent Unicode releases */
2928
ret = 0;
2929
break;
2930
case XML_REGEXP_BLOCK_NAME:
2931
ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2932
break;
2933
}
2934
if (neg)
2935
return(!ret);
2936
return(ret);
2937
}
2938
2939
static int
2940
xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2941
int i, ret = 0;
2942
xmlRegRangePtr range;
2943
2944
if ((atom == NULL) || (!IS_CHAR(codepoint)))
2945
return(-1);
2946
2947
switch (atom->type) {
2948
case XML_REGEXP_SUBREG:
2949
case XML_REGEXP_EPSILON:
2950
return(-1);
2951
case XML_REGEXP_CHARVAL:
2952
return(codepoint == atom->codepoint);
2953
case XML_REGEXP_RANGES: {
2954
int accept = 0;
2955
2956
for (i = 0;i < atom->nbRanges;i++) {
2957
range = atom->ranges[i];
2958
if (range->neg == 2) {
2959
ret = xmlRegCheckCharacterRange(range->type, codepoint,
2960
0, range->start, range->end,
2961
range->blockName);
2962
if (ret != 0)
2963
return(0); /* excluded char */
2964
} else if (range->neg) {
2965
ret = xmlRegCheckCharacterRange(range->type, codepoint,
2966
0, range->start, range->end,
2967
range->blockName);
2968
if (ret == 0)
2969
accept = 1;
2970
else
2971
return(0);
2972
} else {
2973
ret = xmlRegCheckCharacterRange(range->type, codepoint,
2974
0, range->start, range->end,
2975
range->blockName);
2976
if (ret != 0)
2977
accept = 1; /* might still be excluded */
2978
}
2979
}
2980
return(accept);
2981
}
2982
case XML_REGEXP_STRING:
2983
printf("TODO: XML_REGEXP_STRING\n");
2984
return(-1);
2985
case XML_REGEXP_ANYCHAR:
2986
case XML_REGEXP_ANYSPACE:
2987
case XML_REGEXP_NOTSPACE:
2988
case XML_REGEXP_INITNAME:
2989
case XML_REGEXP_NOTINITNAME:
2990
case XML_REGEXP_NAMECHAR:
2991
case XML_REGEXP_NOTNAMECHAR:
2992
case XML_REGEXP_DECIMAL:
2993
case XML_REGEXP_NOTDECIMAL:
2994
case XML_REGEXP_REALCHAR:
2995
case XML_REGEXP_NOTREALCHAR:
2996
case XML_REGEXP_LETTER:
2997
case XML_REGEXP_LETTER_UPPERCASE:
2998
case XML_REGEXP_LETTER_LOWERCASE:
2999
case XML_REGEXP_LETTER_TITLECASE:
3000
case XML_REGEXP_LETTER_MODIFIER:
3001
case XML_REGEXP_LETTER_OTHERS:
3002
case XML_REGEXP_MARK:
3003
case XML_REGEXP_MARK_NONSPACING:
3004
case XML_REGEXP_MARK_SPACECOMBINING:
3005
case XML_REGEXP_MARK_ENCLOSING:
3006
case XML_REGEXP_NUMBER:
3007
case XML_REGEXP_NUMBER_DECIMAL:
3008
case XML_REGEXP_NUMBER_LETTER:
3009
case XML_REGEXP_NUMBER_OTHERS:
3010
case XML_REGEXP_PUNCT:
3011
case XML_REGEXP_PUNCT_CONNECTOR:
3012
case XML_REGEXP_PUNCT_DASH:
3013
case XML_REGEXP_PUNCT_OPEN:
3014
case XML_REGEXP_PUNCT_CLOSE:
3015
case XML_REGEXP_PUNCT_INITQUOTE:
3016
case XML_REGEXP_PUNCT_FINQUOTE:
3017
case XML_REGEXP_PUNCT_OTHERS:
3018
case XML_REGEXP_SEPAR:
3019
case XML_REGEXP_SEPAR_SPACE:
3020
case XML_REGEXP_SEPAR_LINE:
3021
case XML_REGEXP_SEPAR_PARA:
3022
case XML_REGEXP_SYMBOL:
3023
case XML_REGEXP_SYMBOL_MATH:
3024
case XML_REGEXP_SYMBOL_CURRENCY:
3025
case XML_REGEXP_SYMBOL_MODIFIER:
3026
case XML_REGEXP_SYMBOL_OTHERS:
3027
case XML_REGEXP_OTHER:
3028
case XML_REGEXP_OTHER_CONTROL:
3029
case XML_REGEXP_OTHER_FORMAT:
3030
case XML_REGEXP_OTHER_PRIVATE:
3031
case XML_REGEXP_OTHER_NA:
3032
case XML_REGEXP_BLOCK_NAME:
3033
ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3034
(const xmlChar *)atom->valuep);
3035
if (atom->neg)
3036
ret = !ret;
3037
break;
3038
}
3039
return(ret);
3040
}
3041
3042
/************************************************************************
3043
* *
3044
* Saving and restoring state of an execution context *
3045
* *
3046
************************************************************************/
3047
3048
static void
3049
xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3050
#ifdef MAX_PUSH
3051
if (exec->nbPush > MAX_PUSH) {
3052
exec->status = XML_REGEXP_INTERNAL_LIMIT;
3053
return;
3054
}
3055
exec->nbPush++;
3056
#endif
3057
3058
if (exec->maxRollbacks == 0) {
3059
exec->maxRollbacks = 4;
3060
exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3061
sizeof(xmlRegExecRollback));
3062
if (exec->rollbacks == NULL) {
3063
xmlRegexpErrMemory(NULL, "saving regexp");
3064
exec->maxRollbacks = 0;
3065
exec->status = XML_REGEXP_OUT_OF_MEMORY;
3066
return;
3067
}
3068
memset(exec->rollbacks, 0,
3069
exec->maxRollbacks * sizeof(xmlRegExecRollback));
3070
} else if (exec->nbRollbacks >= exec->maxRollbacks) {
3071
xmlRegExecRollback *tmp;
3072
int len = exec->maxRollbacks;
3073
3074
exec->maxRollbacks *= 2;
3075
tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3076
exec->maxRollbacks * sizeof(xmlRegExecRollback));
3077
if (tmp == NULL) {
3078
xmlRegexpErrMemory(NULL, "saving regexp");
3079
exec->maxRollbacks /= 2;
3080
exec->status = XML_REGEXP_OUT_OF_MEMORY;
3081
return;
3082
}
3083
exec->rollbacks = tmp;
3084
tmp = &exec->rollbacks[len];
3085
memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3086
}
3087
exec->rollbacks[exec->nbRollbacks].state = exec->state;
3088
exec->rollbacks[exec->nbRollbacks].index = exec->index;
3089
exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3090
if (exec->comp->nbCounters > 0) {
3091
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3092
exec->rollbacks[exec->nbRollbacks].counts = (int *)
3093
xmlMalloc(exec->comp->nbCounters * sizeof(int));
3094
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3095
xmlRegexpErrMemory(NULL, "saving regexp");
3096
exec->status = XML_REGEXP_OUT_OF_MEMORY;
3097
return;
3098
}
3099
}
3100
memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3101
exec->comp->nbCounters * sizeof(int));
3102
}
3103
exec->nbRollbacks++;
3104
}
3105
3106
static void
3107
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3108
if (exec->status != XML_REGEXP_OK)
3109
return;
3110
if (exec->nbRollbacks <= 0) {
3111
exec->status = XML_REGEXP_NOT_FOUND;
3112
return;
3113
}
3114
exec->nbRollbacks--;
3115
exec->state = exec->rollbacks[exec->nbRollbacks].state;
3116
exec->index = exec->rollbacks[exec->nbRollbacks].index;
3117
exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3118
if (exec->comp->nbCounters > 0) {
3119
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3120
fprintf(stderr, "exec save: allocation failed");
3121
exec->status = XML_REGEXP_INTERNAL_ERROR;
3122
return;
3123
}
3124
if (exec->counts) {
3125
memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3126
exec->comp->nbCounters * sizeof(int));
3127
}
3128
}
3129
}
3130
3131
/************************************************************************
3132
* *
3133
* Verifier, running an input against a compiled regexp *
3134
* *
3135
************************************************************************/
3136
3137
static int
3138
xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3139
xmlRegExecCtxt execval;
3140
xmlRegExecCtxtPtr exec = &execval;
3141
int ret, codepoint = 0, len, deter;
3142
3143
exec->inputString = content;
3144
exec->index = 0;
3145
exec->nbPush = 0;
3146
exec->determinist = 1;
3147
exec->maxRollbacks = 0;
3148
exec->nbRollbacks = 0;
3149
exec->rollbacks = NULL;
3150
exec->status = XML_REGEXP_OK;
3151
exec->comp = comp;
3152
exec->state = comp->states[0];
3153
exec->transno = 0;
3154
exec->transcount = 0;
3155
exec->inputStack = NULL;
3156
exec->inputStackMax = 0;
3157
if (comp->nbCounters > 0) {
3158
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3159
if (exec->counts == NULL) {
3160
xmlRegexpErrMemory(NULL, "running regexp");
3161
return(XML_REGEXP_OUT_OF_MEMORY);
3162
}
3163
memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3164
} else
3165
exec->counts = NULL;
3166
while ((exec->status == XML_REGEXP_OK) && (exec->state != NULL) &&
3167
((exec->inputString[exec->index] != 0) ||
3168
((exec->state != NULL) &&
3169
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3170
xmlRegTransPtr trans;
3171
xmlRegAtomPtr atom;
3172
3173
/*
3174
* If end of input on non-terminal state, rollback, however we may
3175
* still have epsilon like transition for counted transitions
3176
* on counters, in that case don't break too early. Additionally,
3177
* if we are working on a range like "AB{0,2}", where B is not present,
3178
* we don't want to break.
3179
*/
3180
len = 1;
3181
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3182
/*
3183
* if there is a transition, we must check if
3184
* atom allows minOccurs of 0
3185
*/
3186
if (exec->transno < exec->state->nbTrans) {
3187
trans = &exec->state->trans[exec->transno];
3188
if (trans->to >=0) {
3189
atom = trans->atom;
3190
if (!((atom->min == 0) && (atom->max > 0)))
3191
goto rollback;
3192
}
3193
} else
3194
goto rollback;
3195
}
3196
3197
exec->transcount = 0;
3198
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3199
trans = &exec->state->trans[exec->transno];
3200
if (trans->to < 0)
3201
continue;
3202
atom = trans->atom;
3203
ret = 0;
3204
deter = 1;
3205
if (trans->count >= 0) {
3206
int count;
3207
xmlRegCounterPtr counter;
3208
3209
if (exec->counts == NULL) {
3210
exec->status = XML_REGEXP_INTERNAL_ERROR;
3211
goto error;
3212
}
3213
/*
3214
* A counted transition.
3215
*/
3216
3217
count = exec->counts[trans->count];
3218
counter = &exec->comp->counters[trans->count];
3219
ret = ((count >= counter->min) && (count <= counter->max));
3220
if ((ret) && (counter->min != counter->max))
3221
deter = 0;
3222
} else if (atom == NULL) {
3223
fprintf(stderr, "epsilon transition left at runtime\n");
3224
exec->status = XML_REGEXP_INTERNAL_ERROR;
3225
break;
3226
} else if (exec->inputString[exec->index] != 0) {
3227
len = 4;
3228
codepoint = xmlGetUTF8Char(&exec->inputString[exec->index],
3229
&len);
3230
if (codepoint < 0) {
3231
exec->status = XML_REGEXP_INVALID_UTF8;
3232
goto error;
3233
}
3234
ret = xmlRegCheckCharacter(atom, codepoint);
3235
if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3236
xmlRegStatePtr to = comp->states[trans->to];
3237
3238
/*
3239
* this is a multiple input sequence
3240
* If there is a counter associated increment it now.
3241
* do not increment if the counter is already over the
3242
* maximum limit in which case get to next transition
3243
*/
3244
if (trans->counter >= 0) {
3245
xmlRegCounterPtr counter;
3246
3247
if ((exec->counts == NULL) ||
3248
(exec->comp == NULL) ||
3249
(exec->comp->counters == NULL)) {
3250
exec->status = XML_REGEXP_INTERNAL_ERROR;
3251
goto error;
3252
}
3253
counter = &exec->comp->counters[trans->counter];
3254
if (exec->counts[trans->counter] >= counter->max)
3255
continue; /* for loop on transitions */
3256
}
3257
/* Save before incrementing */
3258
if (exec->state->nbTrans > exec->transno + 1) {
3259
xmlFARegExecSave(exec);
3260
if (exec->status != XML_REGEXP_OK)
3261
goto error;
3262
}
3263
if (trans->counter >= 0) {
3264
exec->counts[trans->counter]++;
3265
}
3266
exec->transcount = 1;
3267
do {
3268
/*
3269
* Try to progress as much as possible on the input
3270
*/
3271
if (exec->transcount == atom->max) {
3272
break;
3273
}
3274
exec->index += len;
3275
/*
3276
* End of input: stop here
3277
*/
3278
if (exec->inputString[exec->index] == 0) {
3279
exec->index -= len;
3280
break;
3281
}
3282
if (exec->transcount >= atom->min) {
3283
int transno = exec->transno;
3284
xmlRegStatePtr state = exec->state;
3285
3286
/*
3287
* The transition is acceptable save it
3288
*/
3289
exec->transno = -1; /* trick */
3290
exec->state = to;
3291
xmlFARegExecSave(exec);
3292
if (exec->status != XML_REGEXP_OK)
3293
goto error;
3294
exec->transno = transno;
3295
exec->state = state;
3296
}
3297
len = 4;
3298
codepoint = xmlGetUTF8Char(
3299
&exec->inputString[exec->index], &len);
3300
if (codepoint < 0) {
3301
exec->status = XML_REGEXP_INVALID_UTF8;
3302
goto error;
3303
}
3304
ret = xmlRegCheckCharacter(atom, codepoint);
3305
exec->transcount++;
3306
} while (ret == 1);
3307
if (exec->transcount < atom->min)
3308
ret = 0;
3309
3310
/*
3311
* If the last check failed but one transition was found
3312
* possible, rollback
3313
*/
3314
if (ret < 0)
3315
ret = 0;
3316
if (ret == 0) {
3317
goto rollback;
3318
}
3319
if (trans->counter >= 0) {
3320
if (exec->counts == NULL) {
3321
exec->status = XML_REGEXP_INTERNAL_ERROR;
3322
goto error;
3323
}
3324
exec->counts[trans->counter]--;
3325
}
3326
} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3327
/*
3328
* we don't match on the codepoint, but minOccurs of 0
3329
* says that's ok. Setting len to 0 inhibits stepping
3330
* over the codepoint.
3331
*/
3332
exec->transcount = 1;
3333
len = 0;
3334
ret = 1;
3335
}
3336
} else if ((atom->min == 0) && (atom->max > 0)) {
3337
/* another spot to match when minOccurs is 0 */
3338
exec->transcount = 1;
3339
len = 0;
3340
ret = 1;
3341
}
3342
if (ret == 1) {
3343
if ((trans->nd == 1) ||
3344
((trans->count >= 0) && (deter == 0) &&
3345
(exec->state->nbTrans > exec->transno + 1))) {
3346
xmlFARegExecSave(exec);
3347
if (exec->status != XML_REGEXP_OK)
3348
goto error;
3349
}
3350
if (trans->counter >= 0) {
3351
xmlRegCounterPtr counter;
3352
3353
/* make sure we don't go over the counter maximum value */
3354
if ((exec->counts == NULL) ||
3355
(exec->comp == NULL) ||
3356
(exec->comp->counters == NULL)) {
3357
exec->status = XML_REGEXP_INTERNAL_ERROR;
3358
goto error;
3359
}
3360
counter = &exec->comp->counters[trans->counter];
3361
if (exec->counts[trans->counter] >= counter->max)
3362
continue; /* for loop on transitions */
3363
exec->counts[trans->counter]++;
3364
}
3365
if ((trans->count >= 0) &&
3366
(trans->count < REGEXP_ALL_COUNTER)) {
3367
if (exec->counts == NULL) {
3368
exec->status = XML_REGEXP_INTERNAL_ERROR;
3369
goto error;
3370
}
3371
exec->counts[trans->count] = 0;
3372
}
3373
exec->state = comp->states[trans->to];
3374
exec->transno = 0;
3375
if (trans->atom != NULL) {
3376
exec->index += len;
3377
}
3378
goto progress;
3379
} else if (ret < 0) {
3380
exec->status = XML_REGEXP_INTERNAL_ERROR;
3381
break;
3382
}
3383
}
3384
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3385
rollback:
3386
/*
3387
* Failed to find a way out
3388
*/
3389
exec->determinist = 0;
3390
xmlFARegExecRollBack(exec);
3391
}
3392
progress:
3393
continue;
3394
}
3395
error:
3396
if (exec->rollbacks != NULL) {
3397
if (exec->counts != NULL) {
3398
int i;
3399
3400
for (i = 0;i < exec->maxRollbacks;i++)
3401
if (exec->rollbacks[i].counts != NULL)
3402
xmlFree(exec->rollbacks[i].counts);
3403
}
3404
xmlFree(exec->rollbacks);
3405
}
3406
if (exec->state == NULL)
3407
return(XML_REGEXP_INTERNAL_ERROR);
3408
if (exec->counts != NULL)
3409
xmlFree(exec->counts);
3410
if (exec->status == XML_REGEXP_OK)
3411
return(1);
3412
if (exec->status == XML_REGEXP_NOT_FOUND)
3413
return(0);
3414
return(exec->status);
3415
}
3416
3417
/************************************************************************
3418
* *
3419
* Progressive interface to the verifier one atom at a time *
3420
* *
3421
************************************************************************/
3422
3423
/**
3424
* xmlRegNewExecCtxt:
3425
* @comp: a precompiled regular expression
3426
* @callback: a callback function used for handling progresses in the
3427
* automata matching phase
3428
* @data: the context data associated to the callback in this context
3429
*
3430
* Build a context used for progressive evaluation of a regexp.
3431
*
3432
* Returns the new context
3433
*/
3434
xmlRegExecCtxtPtr
3435
xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3436
xmlRegExecCtxtPtr exec;
3437
3438
if (comp == NULL)
3439
return(NULL);
3440
if ((comp->compact == NULL) && (comp->states == NULL))
3441
return(NULL);
3442
exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3443
if (exec == NULL) {
3444
xmlRegexpErrMemory(NULL, "creating execution context");
3445
return(NULL);
3446
}
3447
memset(exec, 0, sizeof(xmlRegExecCtxt));
3448
exec->inputString = NULL;
3449
exec->index = 0;
3450
exec->determinist = 1;
3451
exec->maxRollbacks = 0;
3452
exec->nbRollbacks = 0;
3453
exec->rollbacks = NULL;
3454
exec->status = XML_REGEXP_OK;
3455
exec->comp = comp;
3456
if (comp->compact == NULL)
3457
exec->state = comp->states[0];
3458
exec->transno = 0;
3459
exec->transcount = 0;
3460
exec->callback = callback;
3461
exec->data = data;
3462
if (comp->nbCounters > 0) {
3463
/*
3464
* For error handling, exec->counts is allocated twice the size
3465
* the second half is used to store the data in case of rollback
3466
*/
3467
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3468
* 2);
3469
if (exec->counts == NULL) {
3470
xmlRegexpErrMemory(NULL, "creating execution context");
3471
xmlFree(exec);
3472
return(NULL);
3473
}
3474
memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3475
exec->errCounts = &exec->counts[comp->nbCounters];
3476
} else {
3477
exec->counts = NULL;
3478
exec->errCounts = NULL;
3479
}
3480
exec->inputStackMax = 0;
3481
exec->inputStackNr = 0;
3482
exec->inputStack = NULL;
3483
exec->errStateNo = -1;
3484
exec->errString = NULL;
3485
exec->nbPush = 0;
3486
return(exec);
3487
}
3488
3489
/**
3490
* xmlRegFreeExecCtxt:
3491
* @exec: a regular expression evaluation context
3492
*
3493
* Free the structures associated to a regular expression evaluation context.
3494
*/
3495
void
3496
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3497
if (exec == NULL)
3498
return;
3499
3500
if (exec->rollbacks != NULL) {
3501
if (exec->counts != NULL) {
3502
int i;
3503
3504
for (i = 0;i < exec->maxRollbacks;i++)
3505
if (exec->rollbacks[i].counts != NULL)
3506
xmlFree(exec->rollbacks[i].counts);
3507
}
3508
xmlFree(exec->rollbacks);
3509
}
3510
if (exec->counts != NULL)
3511
xmlFree(exec->counts);
3512
if (exec->inputStack != NULL) {
3513
int i;
3514
3515
for (i = 0;i < exec->inputStackNr;i++) {
3516
if (exec->inputStack[i].value != NULL)
3517
xmlFree(exec->inputStack[i].value);
3518
}
3519
xmlFree(exec->inputStack);
3520
}
3521
if (exec->errString != NULL)
3522
xmlFree(exec->errString);
3523
xmlFree(exec);
3524
}
3525
3526
static void
3527
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3528
void *data) {
3529
if (exec->inputStackMax == 0) {
3530
exec->inputStackMax = 4;
3531
exec->inputStack = (xmlRegInputTokenPtr)
3532
xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3533
if (exec->inputStack == NULL) {
3534
xmlRegexpErrMemory(NULL, "pushing input string");
3535
exec->inputStackMax = 0;
3536
return;
3537
}
3538
} else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3539
xmlRegInputTokenPtr tmp;
3540
3541
exec->inputStackMax *= 2;
3542
tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3543
exec->inputStackMax * sizeof(xmlRegInputToken));
3544
if (tmp == NULL) {
3545
xmlRegexpErrMemory(NULL, "pushing input string");
3546
exec->inputStackMax /= 2;
3547
return;
3548
}
3549
exec->inputStack = tmp;
3550
}
3551
exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3552
exec->inputStack[exec->inputStackNr].data = data;
3553
exec->inputStackNr++;
3554
exec->inputStack[exec->inputStackNr].value = NULL;
3555
exec->inputStack[exec->inputStackNr].data = NULL;
3556
}
3557
3558
/**
3559
* xmlRegStrEqualWildcard:
3560
* @expStr: the string to be evaluated
3561
* @valStr: the validation string
3562
*
3563
* Checks if both strings are equal or have the same content. "*"
3564
* can be used as a wildcard in @valStr; "|" is used as a separator of
3565
* substrings in both @expStr and @valStr.
3566
*
3567
* Returns 1 if the comparison is satisfied and the number of substrings
3568
* is equal, 0 otherwise.
3569
*/
3570
3571
static int
3572
xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3573
if (expStr == valStr) return(1);
3574
if (expStr == NULL) return(0);
3575
if (valStr == NULL) return(0);
3576
do {
3577
/*
3578
* Eval if we have a wildcard for the current item.
3579
*/
3580
if (*expStr != *valStr) {
3581
/* if one of them starts with a wildcard make valStr be it */
3582
if (*valStr == '*') {
3583
const xmlChar *tmp;
3584
3585
tmp = valStr;
3586
valStr = expStr;
3587
expStr = tmp;
3588
}
3589
if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3590
do {
3591
if (*valStr == XML_REG_STRING_SEPARATOR)
3592
break;
3593
valStr++;
3594
} while (*valStr != 0);
3595
continue;
3596
} else
3597
return(0);
3598
}
3599
expStr++;
3600
valStr++;
3601
} while (*valStr != 0);
3602
if (*expStr != 0)
3603
return (0);
3604
else
3605
return (1);
3606
}
3607
3608
/**
3609
* xmlRegCompactPushString:
3610
* @exec: a regexp execution context
3611
* @comp: the precompiled exec with a compact table
3612
* @value: a string token input
3613
* @data: data associated to the token to reuse in callbacks
3614
*
3615
* Push one input token in the execution context
3616
*
3617
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
3618
* a negative value in case of error.
3619
*/
3620
static int
3621
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3622
xmlRegexpPtr comp,
3623
const xmlChar *value,
3624
void *data) {
3625
int state = exec->index;
3626
int i, target;
3627
3628
if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3629
return(-1);
3630
3631
if (value == NULL) {
3632
/*
3633
* are we at a final state ?
3634
*/
3635
if (comp->compact[state * (comp->nbstrings + 1)] ==
3636
XML_REGEXP_FINAL_STATE)
3637
return(1);
3638
return(0);
3639
}
3640
3641
/*
3642
* Examine all outside transitions from current state
3643
*/
3644
for (i = 0;i < comp->nbstrings;i++) {
3645
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3646
if ((target > 0) && (target <= comp->nbstates)) {
3647
target--; /* to avoid 0 */
3648
if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3649
exec->index = target;
3650
if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3651
exec->callback(exec->data, value,
3652
comp->transdata[state * comp->nbstrings + i], data);
3653
}
3654
if (comp->compact[target * (comp->nbstrings + 1)] ==
3655
XML_REGEXP_SINK_STATE)
3656
goto error;
3657
3658
if (comp->compact[target * (comp->nbstrings + 1)] ==
3659
XML_REGEXP_FINAL_STATE)
3660
return(1);
3661
return(0);
3662
}
3663
}
3664
}
3665
/*
3666
* Failed to find an exit transition out from current state for the
3667
* current token
3668
*/
3669
error:
3670
if (exec->errString != NULL)
3671
xmlFree(exec->errString);
3672
exec->errString = xmlStrdup(value);
3673
exec->errStateNo = state;
3674
exec->status = XML_REGEXP_NOT_FOUND;
3675
return(XML_REGEXP_NOT_FOUND);
3676
}
3677
3678
/**
3679
* xmlRegExecPushStringInternal:
3680
* @exec: a regexp execution context or NULL to indicate the end
3681
* @value: a string token input
3682
* @data: data associated to the token to reuse in callbacks
3683
* @compound: value was assembled from 2 strings
3684
*
3685
* Push one input token in the execution context
3686
*
3687
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
3688
* a negative value in case of error.
3689
*/
3690
static int
3691
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3692
void *data, int compound) {
3693
xmlRegTransPtr trans;
3694
xmlRegAtomPtr atom;
3695
int ret;
3696
int final = 0;
3697
int progress = 1;
3698
3699
if (exec == NULL)
3700
return(-1);
3701
if (exec->comp == NULL)
3702
return(-1);
3703
if (exec->status != XML_REGEXP_OK)
3704
return(exec->status);
3705
3706
if (exec->comp->compact != NULL)
3707
return(xmlRegCompactPushString(exec, exec->comp, value, data));
3708
3709
if (value == NULL) {
3710
if (exec->state->type == XML_REGEXP_FINAL_STATE)
3711
return(1);
3712
final = 1;
3713
}
3714
3715
/*
3716
* If we have an active rollback stack push the new value there
3717
* and get back to where we were left
3718
*/
3719
if ((value != NULL) && (exec->inputStackNr > 0)) {
3720
xmlFARegExecSaveInputString(exec, value, data);
3721
value = exec->inputStack[exec->index].value;
3722
data = exec->inputStack[exec->index].data;
3723
}
3724
3725
while ((exec->status == XML_REGEXP_OK) &&
3726
((value != NULL) ||
3727
((final == 1) &&
3728
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3729
3730
/*
3731
* End of input on non-terminal state, rollback, however we may
3732
* still have epsilon like transition for counted transitions
3733
* on counters, in that case don't break too early.
3734
*/
3735
if ((value == NULL) && (exec->counts == NULL))
3736
goto rollback;
3737
3738
exec->transcount = 0;
3739
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3740
trans = &exec->state->trans[exec->transno];
3741
if (trans->to < 0)
3742
continue;
3743
atom = trans->atom;
3744
ret = 0;
3745
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3746
int i;
3747
int count;
3748
xmlRegTransPtr t;
3749
xmlRegCounterPtr counter;
3750
3751
ret = 0;
3752
3753
/*
3754
* Check all counted transitions from the current state
3755
*/
3756
if ((value == NULL) && (final)) {
3757
ret = 1;
3758
} else if (value != NULL) {
3759
for (i = 0;i < exec->state->nbTrans;i++) {
3760
t = &exec->state->trans[i];
3761
if ((t->counter < 0) || (t == trans))
3762
continue;
3763
counter = &exec->comp->counters[t->counter];
3764
count = exec->counts[t->counter];
3765
if ((count < counter->max) &&
3766
(t->atom != NULL) &&
3767
(xmlStrEqual(value, t->atom->valuep))) {
3768
ret = 0;
3769
break;
3770
}
3771
if ((count >= counter->min) &&
3772
(count < counter->max) &&
3773
(t->atom != NULL) &&
3774
(xmlStrEqual(value, t->atom->valuep))) {
3775
ret = 1;
3776
break;
3777
}
3778
}
3779
}
3780
} else if (trans->count == REGEXP_ALL_COUNTER) {
3781
int i;
3782
int count;
3783
xmlRegTransPtr t;
3784
xmlRegCounterPtr counter;
3785
3786
ret = 1;
3787
3788
/*
3789
* Check all counted transitions from the current state
3790
*/
3791
for (i = 0;i < exec->state->nbTrans;i++) {
3792
t = &exec->state->trans[i];
3793
if ((t->counter < 0) || (t == trans))
3794
continue;
3795
counter = &exec->comp->counters[t->counter];
3796
count = exec->counts[t->counter];
3797
if ((count < counter->min) || (count > counter->max)) {
3798
ret = 0;
3799
break;
3800
}
3801
}
3802
} else if (trans->count >= 0) {
3803
int count;
3804
xmlRegCounterPtr counter;
3805
3806
/*
3807
* A counted transition.
3808
*/
3809
3810
count = exec->counts[trans->count];
3811
counter = &exec->comp->counters[trans->count];
3812
ret = ((count >= counter->min) && (count <= counter->max));
3813
} else if (atom == NULL) {
3814
fprintf(stderr, "epsilon transition left at runtime\n");
3815
exec->status = XML_REGEXP_INTERNAL_ERROR;
3816
break;
3817
} else if (value != NULL) {
3818
ret = xmlRegStrEqualWildcard(atom->valuep, value);
3819
if (atom->neg) {
3820
ret = !ret;
3821
if (!compound)
3822
ret = 0;
3823
}
3824
if ((ret == 1) && (trans->counter >= 0)) {
3825
xmlRegCounterPtr counter;
3826
int count;
3827
3828
count = exec->counts[trans->counter];
3829
counter = &exec->comp->counters[trans->counter];
3830
if (count >= counter->max)
3831
ret = 0;
3832
}
3833
3834
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3835
xmlRegStatePtr to = exec->comp->states[trans->to];
3836
3837
/*
3838
* this is a multiple input sequence
3839
*/
3840
if (exec->state->nbTrans > exec->transno + 1) {
3841
if (exec->inputStackNr <= 0) {
3842
xmlFARegExecSaveInputString(exec, value, data);
3843
}
3844
xmlFARegExecSave(exec);
3845
}
3846
exec->transcount = 1;
3847
do {
3848
/*
3849
* Try to progress as much as possible on the input
3850
*/
3851
if (exec->transcount == atom->max) {
3852
break;
3853
}
3854
exec->index++;
3855
value = exec->inputStack[exec->index].value;
3856
data = exec->inputStack[exec->index].data;
3857
3858
/*
3859
* End of input: stop here
3860
*/
3861
if (value == NULL) {
3862
exec->index --;
3863
break;
3864
}
3865
if (exec->transcount >= atom->min) {
3866
int transno = exec->transno;
3867
xmlRegStatePtr state = exec->state;
3868
3869
/*
3870
* The transition is acceptable save it
3871
*/
3872
exec->transno = -1; /* trick */
3873
exec->state = to;
3874
if (exec->inputStackNr <= 0) {
3875
xmlFARegExecSaveInputString(exec, value, data);
3876
}
3877
xmlFARegExecSave(exec);
3878
exec->transno = transno;
3879
exec->state = state;
3880
}
3881
ret = xmlStrEqual(value, atom->valuep);
3882
exec->transcount++;
3883
} while (ret == 1);
3884
if (exec->transcount < atom->min)
3885
ret = 0;
3886
3887
/*
3888
* If the last check failed but one transition was found
3889
* possible, rollback
3890
*/
3891
if (ret < 0)
3892
ret = 0;
3893
if (ret == 0) {
3894
goto rollback;
3895
}
3896
}
3897
}
3898
if (ret == 1) {
3899
if ((exec->callback != NULL) && (atom != NULL) &&
3900
(data != NULL)) {
3901
exec->callback(exec->data, atom->valuep,
3902
atom->data, data);
3903
}
3904
if (exec->state->nbTrans > exec->transno + 1) {
3905
if (exec->inputStackNr <= 0) {
3906
xmlFARegExecSaveInputString(exec, value, data);
3907
}
3908
xmlFARegExecSave(exec);
3909
}
3910
if (trans->counter >= 0) {
3911
exec->counts[trans->counter]++;
3912
}
3913
if ((trans->count >= 0) &&
3914
(trans->count < REGEXP_ALL_COUNTER)) {
3915
exec->counts[trans->count] = 0;
3916
}
3917
if ((exec->comp->states[trans->to] != NULL) &&
3918
(exec->comp->states[trans->to]->type ==
3919
XML_REGEXP_SINK_STATE)) {
3920
/*
3921
* entering a sink state, save the current state as error
3922
* state.
3923
*/
3924
if (exec->errString != NULL)
3925
xmlFree(exec->errString);
3926
exec->errString = xmlStrdup(value);
3927
exec->errState = exec->state;
3928
memcpy(exec->errCounts, exec->counts,
3929
exec->comp->nbCounters * sizeof(int));
3930
}
3931
exec->state = exec->comp->states[trans->to];
3932
exec->transno = 0;
3933
if (trans->atom != NULL) {
3934
if (exec->inputStack != NULL) {
3935
exec->index++;
3936
if (exec->index < exec->inputStackNr) {
3937
value = exec->inputStack[exec->index].value;
3938
data = exec->inputStack[exec->index].data;
3939
} else {
3940
value = NULL;
3941
data = NULL;
3942
}
3943
} else {
3944
value = NULL;
3945
data = NULL;
3946
}
3947
}
3948
goto progress;
3949
} else if (ret < 0) {
3950
exec->status = XML_REGEXP_INTERNAL_ERROR;
3951
break;
3952
}
3953
}
3954
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3955
rollback:
3956
/*
3957
* if we didn't yet rollback on the current input
3958
* store the current state as the error state.
3959
*/
3960
if ((progress) && (exec->state != NULL) &&
3961
(exec->state->type != XML_REGEXP_SINK_STATE)) {
3962
progress = 0;
3963
if (exec->errString != NULL)
3964
xmlFree(exec->errString);
3965
exec->errString = xmlStrdup(value);
3966
exec->errState = exec->state;
3967
if (exec->comp->nbCounters)
3968
memcpy(exec->errCounts, exec->counts,
3969
exec->comp->nbCounters * sizeof(int));
3970
}
3971
3972
/*
3973
* Failed to find a way out
3974
*/
3975
exec->determinist = 0;
3976
xmlFARegExecRollBack(exec);
3977
if ((exec->inputStack != NULL ) &&
3978
(exec->status == XML_REGEXP_OK)) {
3979
value = exec->inputStack[exec->index].value;
3980
data = exec->inputStack[exec->index].data;
3981
}
3982
}
3983
continue;
3984
progress:
3985
progress = 1;
3986
continue;
3987
}
3988
if (exec->status == XML_REGEXP_OK) {
3989
return(exec->state->type == XML_REGEXP_FINAL_STATE);
3990
}
3991
return(exec->status);
3992
}
3993
3994
/**
3995
* xmlRegExecPushString:
3996
* @exec: a regexp execution context or NULL to indicate the end
3997
* @value: a string token input
3998
* @data: data associated to the token to reuse in callbacks
3999
*
4000
* Push one input token in the execution context
4001
*
4002
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
4003
* a negative value in case of error.
4004
*/
4005
int
4006
xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4007
void *data) {
4008
return(xmlRegExecPushStringInternal(exec, value, data, 0));
4009
}
4010
4011
/**
4012
* xmlRegExecPushString2:
4013
* @exec: a regexp execution context or NULL to indicate the end
4014
* @value: the first string token input
4015
* @value2: the second string token input
4016
* @data: data associated to the token to reuse in callbacks
4017
*
4018
* Push one input token in the execution context
4019
*
4020
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
4021
* a negative value in case of error.
4022
*/
4023
int
4024
xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4025
const xmlChar *value2, void *data) {
4026
xmlChar buf[150];
4027
int lenn, lenp, ret;
4028
xmlChar *str;
4029
4030
if (exec == NULL)
4031
return(-1);
4032
if (exec->comp == NULL)
4033
return(-1);
4034
if (exec->status != XML_REGEXP_OK)
4035
return(exec->status);
4036
4037
if (value2 == NULL)
4038
return(xmlRegExecPushString(exec, value, data));
4039
4040
lenn = strlen((char *) value2);
4041
lenp = strlen((char *) value);
4042
4043
if (150 < lenn + lenp + 2) {
4044
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4045
if (str == NULL) {
4046
exec->status = XML_REGEXP_OUT_OF_MEMORY;
4047
return(-1);
4048
}
4049
} else {
4050
str = buf;
4051
}
4052
memcpy(&str[0], value, lenp);
4053
str[lenp] = XML_REG_STRING_SEPARATOR;
4054
memcpy(&str[lenp + 1], value2, lenn);
4055
str[lenn + lenp + 1] = 0;
4056
4057
if (exec->comp->compact != NULL)
4058
ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4059
else
4060
ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4061
4062
if (str != buf)
4063
xmlFree(str);
4064
return(ret);
4065
}
4066
4067
/**
4068
* xmlRegExecGetValues:
4069
* @exec: a regexp execution context
4070
* @err: error extraction or normal one
4071
* @nbval: pointer to the number of accepted values IN/OUT
4072
* @nbneg: return number of negative transitions
4073
* @values: pointer to the array of acceptable values
4074
* @terminal: return value if this was a terminal state
4075
*
4076
* Extract information from the regexp execution, internal routine to
4077
* implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4078
*
4079
* Returns: 0 in case of success or -1 in case of error.
4080
*/
4081
static int
4082
xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4083
int *nbval, int *nbneg,
4084
xmlChar **values, int *terminal) {
4085
int maxval;
4086
int nb = 0;
4087
4088
if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4089
(values == NULL) || (*nbval <= 0))
4090
return(-1);
4091
4092
maxval = *nbval;
4093
*nbval = 0;
4094
*nbneg = 0;
4095
if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4096
xmlRegexpPtr comp;
4097
int target, i, state;
4098
4099
comp = exec->comp;
4100
4101
if (err) {
4102
if (exec->errStateNo == -1) return(-1);
4103
state = exec->errStateNo;
4104
} else {
4105
state = exec->index;
4106
}
4107
if (terminal != NULL) {
4108
if (comp->compact[state * (comp->nbstrings + 1)] ==
4109
XML_REGEXP_FINAL_STATE)
4110
*terminal = 1;
4111
else
4112
*terminal = 0;
4113
}
4114
for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4115
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4116
if ((target > 0) && (target <= comp->nbstates) &&
4117
(comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4118
XML_REGEXP_SINK_STATE)) {
4119
values[nb++] = comp->stringMap[i];
4120
(*nbval)++;
4121
}
4122
}
4123
for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4124
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4125
if ((target > 0) && (target <= comp->nbstates) &&
4126
(comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4127
XML_REGEXP_SINK_STATE)) {
4128
values[nb++] = comp->stringMap[i];
4129
(*nbneg)++;
4130
}
4131
}
4132
} else {
4133
int transno;
4134
xmlRegTransPtr trans;
4135
xmlRegAtomPtr atom;
4136
xmlRegStatePtr state;
4137
4138
if (terminal != NULL) {
4139
if (exec->state->type == XML_REGEXP_FINAL_STATE)
4140
*terminal = 1;
4141
else
4142
*terminal = 0;
4143
}
4144
4145
if (err) {
4146
if (exec->errState == NULL) return(-1);
4147
state = exec->errState;
4148
} else {
4149
if (exec->state == NULL) return(-1);
4150
state = exec->state;
4151
}
4152
for (transno = 0;
4153
(transno < state->nbTrans) && (nb < maxval);
4154
transno++) {
4155
trans = &state->trans[transno];
4156
if (trans->to < 0)
4157
continue;
4158
atom = trans->atom;
4159
if ((atom == NULL) || (atom->valuep == NULL))
4160
continue;
4161
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4162
/* this should not be reached but ... */
4163
TODO;
4164
} else if (trans->count == REGEXP_ALL_COUNTER) {
4165
/* this should not be reached but ... */
4166
TODO;
4167
} else if (trans->counter >= 0) {
4168
xmlRegCounterPtr counter = NULL;
4169
int count;
4170
4171
if (err)
4172
count = exec->errCounts[trans->counter];
4173
else
4174
count = exec->counts[trans->counter];
4175
if (exec->comp != NULL)
4176
counter = &exec->comp->counters[trans->counter];
4177
if ((counter == NULL) || (count < counter->max)) {
4178
if (atom->neg)
4179
values[nb++] = (xmlChar *) atom->valuep2;
4180
else
4181
values[nb++] = (xmlChar *) atom->valuep;
4182
(*nbval)++;
4183
}
4184
} else {
4185
if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4186
(exec->comp->states[trans->to]->type !=
4187
XML_REGEXP_SINK_STATE)) {
4188
if (atom->neg)
4189
values[nb++] = (xmlChar *) atom->valuep2;
4190
else
4191
values[nb++] = (xmlChar *) atom->valuep;
4192
(*nbval)++;
4193
}
4194
}
4195
}
4196
for (transno = 0;
4197
(transno < state->nbTrans) && (nb < maxval);
4198
transno++) {
4199
trans = &state->trans[transno];
4200
if (trans->to < 0)
4201
continue;
4202
atom = trans->atom;
4203
if ((atom == NULL) || (atom->valuep == NULL))
4204
continue;
4205
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4206
continue;
4207
} else if (trans->count == REGEXP_ALL_COUNTER) {
4208
continue;
4209
} else if (trans->counter >= 0) {
4210
continue;
4211
} else {
4212
if ((exec->comp->states[trans->to] != NULL) &&
4213
(exec->comp->states[trans->to]->type ==
4214
XML_REGEXP_SINK_STATE)) {
4215
if (atom->neg)
4216
values[nb++] = (xmlChar *) atom->valuep2;
4217
else
4218
values[nb++] = (xmlChar *) atom->valuep;
4219
(*nbneg)++;
4220
}
4221
}
4222
}
4223
}
4224
return(0);
4225
}
4226
4227
/**
4228
* xmlRegExecNextValues:
4229
* @exec: a regexp execution context
4230
* @nbval: pointer to the number of accepted values IN/OUT
4231
* @nbneg: return number of negative transitions
4232
* @values: pointer to the array of acceptable values
4233
* @terminal: return value if this was a terminal state
4234
*
4235
* Extract information from the regexp execution,
4236
* the parameter @values must point to an array of @nbval string pointers
4237
* on return nbval will contain the number of possible strings in that
4238
* state and the @values array will be updated with them. The string values
4239
* returned will be freed with the @exec context and don't need to be
4240
* deallocated.
4241
*
4242
* Returns: 0 in case of success or -1 in case of error.
4243
*/
4244
int
4245
xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4246
xmlChar **values, int *terminal) {
4247
return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4248
}
4249
4250
/**
4251
* xmlRegExecErrInfo:
4252
* @exec: a regexp execution context generating an error
4253
* @string: return value for the error string
4254
* @nbval: pointer to the number of accepted values IN/OUT
4255
* @nbneg: return number of negative transitions
4256
* @values: pointer to the array of acceptable values
4257
* @terminal: return value if this was a terminal state
4258
*
4259
* Extract error information from the regexp execution, the parameter
4260
* @string will be updated with the value pushed and not accepted,
4261
* the parameter @values must point to an array of @nbval string pointers
4262
* on return nbval will contain the number of possible strings in that
4263
* state and the @values array will be updated with them. The string values
4264
* returned will be freed with the @exec context and don't need to be
4265
* deallocated.
4266
*
4267
* Returns: 0 in case of success or -1 in case of error.
4268
*/
4269
int
4270
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4271
int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4272
if (exec == NULL)
4273
return(-1);
4274
if (string != NULL) {
4275
if (exec->status != XML_REGEXP_OK)
4276
*string = exec->errString;
4277
else
4278
*string = NULL;
4279
}
4280
return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4281
}
4282
4283
#if 0
4284
static int
4285
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4286
xmlRegTransPtr trans;
4287
xmlRegAtomPtr atom;
4288
int ret;
4289
int codepoint, len;
4290
4291
if (exec == NULL)
4292
return(-1);
4293
if (exec->status != XML_REGEXP_OK)
4294
return(exec->status);
4295
4296
while ((exec->status == XML_REGEXP_OK) &&
4297
((exec->inputString[exec->index] != 0) ||
4298
(exec->state->type != XML_REGEXP_FINAL_STATE))) {
4299
4300
/*
4301
* End of input on non-terminal state, rollback, however we may
4302
* still have epsilon like transition for counted transitions
4303
* on counters, in that case don't break too early.
4304
*/
4305
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4306
goto rollback;
4307
4308
exec->transcount = 0;
4309
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4310
trans = &exec->state->trans[exec->transno];
4311
if (trans->to < 0)
4312
continue;
4313
atom = trans->atom;
4314
ret = 0;
4315
if (trans->count >= 0) {
4316
int count;
4317
xmlRegCounterPtr counter;
4318
4319
/*
4320
* A counted transition.
4321
*/
4322
4323
count = exec->counts[trans->count];
4324
counter = &exec->comp->counters[trans->count];
4325
ret = ((count >= counter->min) && (count <= counter->max));
4326
} else if (atom == NULL) {
4327
fprintf(stderr, "epsilon transition left at runtime\n");
4328
exec->status = XML_REGEXP_INTERNAL_ERROR;
4329
break;
4330
} else if (exec->inputString[exec->index] != 0) {
4331
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4332
ret = xmlRegCheckCharacter(atom, codepoint);
4333
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4334
xmlRegStatePtr to = exec->comp->states[trans->to];
4335
4336
/*
4337
* this is a multiple input sequence
4338
*/
4339
if (exec->state->nbTrans > exec->transno + 1) {
4340
xmlFARegExecSave(exec);
4341
}
4342
exec->transcount = 1;
4343
do {
4344
/*
4345
* Try to progress as much as possible on the input
4346
*/
4347
if (exec->transcount == atom->max) {
4348
break;
4349
}
4350
exec->index += len;
4351
/*
4352
* End of input: stop here
4353
*/
4354
if (exec->inputString[exec->index] == 0) {
4355
exec->index -= len;
4356
break;
4357
}
4358
if (exec->transcount >= atom->min) {
4359
int transno = exec->transno;
4360
xmlRegStatePtr state = exec->state;
4361
4362
/*
4363
* The transition is acceptable save it
4364
*/
4365
exec->transno = -1; /* trick */
4366
exec->state = to;
4367
xmlFARegExecSave(exec);
4368
exec->transno = transno;
4369
exec->state = state;
4370
}
4371
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4372
len);
4373
ret = xmlRegCheckCharacter(atom, codepoint);
4374
exec->transcount++;
4375
} while (ret == 1);
4376
if (exec->transcount < atom->min)
4377
ret = 0;
4378
4379
/*
4380
* If the last check failed but one transition was found
4381
* possible, rollback
4382
*/
4383
if (ret < 0)
4384
ret = 0;
4385
if (ret == 0) {
4386
goto rollback;
4387
}
4388
}
4389
}
4390
if (ret == 1) {
4391
if (exec->state->nbTrans > exec->transno + 1) {
4392
xmlFARegExecSave(exec);
4393
}
4394
/*
4395
* restart count for expressions like this ((abc){2})*
4396
*/
4397
if (trans->count >= 0) {
4398
exec->counts[trans->count] = 0;
4399
}
4400
if (trans->counter >= 0) {
4401
exec->counts[trans->counter]++;
4402
}
4403
exec->state = exec->comp->states[trans->to];
4404
exec->transno = 0;
4405
if (trans->atom != NULL) {
4406
exec->index += len;
4407
}
4408
goto progress;
4409
} else if (ret < 0) {
4410
exec->status = XML_REGEXP_INTERNAL_ERROR;
4411
break;
4412
}
4413
}
4414
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4415
rollback:
4416
/*
4417
* Failed to find a way out
4418
*/
4419
exec->determinist = 0;
4420
xmlFARegExecRollBack(exec);
4421
}
4422
progress:
4423
continue;
4424
}
4425
}
4426
#endif
4427
/************************************************************************
4428
* *
4429
* Parser for the Schemas Datatype Regular Expressions *
4430
* http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4431
* *
4432
************************************************************************/
4433
4434
/**
4435
* xmlFAIsChar:
4436
* @ctxt: a regexp parser context
4437
*
4438
* [10] Char ::= [^.\?*+()|#x5B#x5D]
4439
*/
4440
static int
4441
xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4442
int cur;
4443
int len;
4444
4445
len = 4;
4446
cur = xmlGetUTF8Char(ctxt->cur, &len);
4447
if (cur < 0) {
4448
ERROR("Invalid UTF-8");
4449
return(0);
4450
}
4451
if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4452
(cur == '*') || (cur == '+') || (cur == '(') ||
4453
(cur == ')') || (cur == '|') || (cur == 0x5B) ||
4454
(cur == 0x5D) || (cur == 0))
4455
return(-1);
4456
return(cur);
4457
}
4458
4459
/**
4460
* xmlFAParseCharProp:
4461
* @ctxt: a regexp parser context
4462
*
4463
* [27] charProp ::= IsCategory | IsBlock
4464
* [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4465
* Separators | Symbols | Others
4466
* [29] Letters ::= 'L' [ultmo]?
4467
* [30] Marks ::= 'M' [nce]?
4468
* [31] Numbers ::= 'N' [dlo]?
4469
* [32] Punctuation ::= 'P' [cdseifo]?
4470
* [33] Separators ::= 'Z' [slp]?
4471
* [34] Symbols ::= 'S' [mcko]?
4472
* [35] Others ::= 'C' [cfon]?
4473
* [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4474
*/
4475
static void
4476
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4477
int cur;
4478
xmlRegAtomType type = (xmlRegAtomType) 0;
4479
xmlChar *blockName = NULL;
4480
4481
cur = CUR;
4482
if (cur == 'L') {
4483
NEXT;
4484
cur = CUR;
4485
if (cur == 'u') {
4486
NEXT;
4487
type = XML_REGEXP_LETTER_UPPERCASE;
4488
} else if (cur == 'l') {
4489
NEXT;
4490
type = XML_REGEXP_LETTER_LOWERCASE;
4491
} else if (cur == 't') {
4492
NEXT;
4493
type = XML_REGEXP_LETTER_TITLECASE;
4494
} else if (cur == 'm') {
4495
NEXT;
4496
type = XML_REGEXP_LETTER_MODIFIER;
4497
} else if (cur == 'o') {
4498
NEXT;
4499
type = XML_REGEXP_LETTER_OTHERS;
4500
} else {
4501
type = XML_REGEXP_LETTER;
4502
}
4503
} else if (cur == 'M') {
4504
NEXT;
4505
cur = CUR;
4506
if (cur == 'n') {
4507
NEXT;
4508
/* nonspacing */
4509
type = XML_REGEXP_MARK_NONSPACING;
4510
} else if (cur == 'c') {
4511
NEXT;
4512
/* spacing combining */
4513
type = XML_REGEXP_MARK_SPACECOMBINING;
4514
} else if (cur == 'e') {
4515
NEXT;
4516
/* enclosing */
4517
type = XML_REGEXP_MARK_ENCLOSING;
4518
} else {
4519
/* all marks */
4520
type = XML_REGEXP_MARK;
4521
}
4522
} else if (cur == 'N') {
4523
NEXT;
4524
cur = CUR;
4525
if (cur == 'd') {
4526
NEXT;
4527
/* digital */
4528
type = XML_REGEXP_NUMBER_DECIMAL;
4529
} else if (cur == 'l') {
4530
NEXT;
4531
/* letter */
4532
type = XML_REGEXP_NUMBER_LETTER;
4533
} else if (cur == 'o') {
4534
NEXT;
4535
/* other */
4536
type = XML_REGEXP_NUMBER_OTHERS;
4537
} else {
4538
/* all numbers */
4539
type = XML_REGEXP_NUMBER;
4540
}
4541
} else if (cur == 'P') {
4542
NEXT;
4543
cur = CUR;
4544
if (cur == 'c') {
4545
NEXT;
4546
/* connector */
4547
type = XML_REGEXP_PUNCT_CONNECTOR;
4548
} else if (cur == 'd') {
4549
NEXT;
4550
/* dash */
4551
type = XML_REGEXP_PUNCT_DASH;
4552
} else if (cur == 's') {
4553
NEXT;
4554
/* open */
4555
type = XML_REGEXP_PUNCT_OPEN;
4556
} else if (cur == 'e') {
4557
NEXT;
4558
/* close */
4559
type = XML_REGEXP_PUNCT_CLOSE;
4560
} else if (cur == 'i') {
4561
NEXT;
4562
/* initial quote */
4563
type = XML_REGEXP_PUNCT_INITQUOTE;
4564
} else if (cur == 'f') {
4565
NEXT;
4566
/* final quote */
4567
type = XML_REGEXP_PUNCT_FINQUOTE;
4568
} else if (cur == 'o') {
4569
NEXT;
4570
/* other */
4571
type = XML_REGEXP_PUNCT_OTHERS;
4572
} else {
4573
/* all punctuation */
4574
type = XML_REGEXP_PUNCT;
4575
}
4576
} else if (cur == 'Z') {
4577
NEXT;
4578
cur = CUR;
4579
if (cur == 's') {
4580
NEXT;
4581
/* space */
4582
type = XML_REGEXP_SEPAR_SPACE;
4583
} else if (cur == 'l') {
4584
NEXT;
4585
/* line */
4586
type = XML_REGEXP_SEPAR_LINE;
4587
} else if (cur == 'p') {
4588
NEXT;
4589
/* paragraph */
4590
type = XML_REGEXP_SEPAR_PARA;
4591
} else {
4592
/* all separators */
4593
type = XML_REGEXP_SEPAR;
4594
}
4595
} else if (cur == 'S') {
4596
NEXT;
4597
cur = CUR;
4598
if (cur == 'm') {
4599
NEXT;
4600
type = XML_REGEXP_SYMBOL_MATH;
4601
/* math */
4602
} else if (cur == 'c') {
4603
NEXT;
4604
type = XML_REGEXP_SYMBOL_CURRENCY;
4605
/* currency */
4606
} else if (cur == 'k') {
4607
NEXT;
4608
type = XML_REGEXP_SYMBOL_MODIFIER;
4609
/* modifiers */
4610
} else if (cur == 'o') {
4611
NEXT;
4612
type = XML_REGEXP_SYMBOL_OTHERS;
4613
/* other */
4614
} else {
4615
/* all symbols */
4616
type = XML_REGEXP_SYMBOL;
4617
}
4618
} else if (cur == 'C') {
4619
NEXT;
4620
cur = CUR;
4621
if (cur == 'c') {
4622
NEXT;
4623
/* control */
4624
type = XML_REGEXP_OTHER_CONTROL;
4625
} else if (cur == 'f') {
4626
NEXT;
4627
/* format */
4628
type = XML_REGEXP_OTHER_FORMAT;
4629
} else if (cur == 'o') {
4630
NEXT;
4631
/* private use */
4632
type = XML_REGEXP_OTHER_PRIVATE;
4633
} else if (cur == 'n') {
4634
NEXT;
4635
/* not assigned */
4636
type = XML_REGEXP_OTHER_NA;
4637
} else {
4638
/* all others */
4639
type = XML_REGEXP_OTHER;
4640
}
4641
} else if (cur == 'I') {
4642
const xmlChar *start;
4643
NEXT;
4644
cur = CUR;
4645
if (cur != 's') {
4646
ERROR("IsXXXX expected");
4647
return;
4648
}
4649
NEXT;
4650
start = ctxt->cur;
4651
cur = CUR;
4652
if (((cur >= 'a') && (cur <= 'z')) ||
4653
((cur >= 'A') && (cur <= 'Z')) ||
4654
((cur >= '0') && (cur <= '9')) ||
4655
(cur == 0x2D)) {
4656
NEXT;
4657
cur = CUR;
4658
while (((cur >= 'a') && (cur <= 'z')) ||
4659
((cur >= 'A') && (cur <= 'Z')) ||
4660
((cur >= '0') && (cur <= '9')) ||
4661
(cur == 0x2D)) {
4662
NEXT;
4663
cur = CUR;
4664
}
4665
}
4666
type = XML_REGEXP_BLOCK_NAME;
4667
blockName = xmlStrndup(start, ctxt->cur - start);
4668
} else {
4669
ERROR("Unknown char property");
4670
return;
4671
}
4672
if (ctxt->atom == NULL) {
4673
ctxt->atom = xmlRegNewAtom(ctxt, type);
4674
if (ctxt->atom == NULL) {
4675
xmlFree(blockName);
4676
return;
4677
}
4678
ctxt->atom->valuep = blockName;
4679
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4680
if (xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4681
type, 0, 0, blockName) == NULL) {
4682
xmlFree(blockName);
4683
}
4684
}
4685
}
4686
4687
static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4688
{
4689
int val = 0, i, cur;
4690
for (i = 0; i < 4; i++) {
4691
NEXT;
4692
val *= 16;
4693
cur = CUR;
4694
if (cur >= '0' && cur <= '9') {
4695
val += cur - '0';
4696
} else if (cur >= 'A' && cur <= 'F') {
4697
val += cur - 'A' + 10;
4698
} else if (cur >= 'a' && cur <= 'f') {
4699
val += cur - 'a' + 10;
4700
} else {
4701
ERROR("Expecting hex digit");
4702
return -1;
4703
}
4704
}
4705
return val;
4706
}
4707
4708
static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4709
{
4710
int val = parse_escaped_codeunit(ctxt);
4711
if (0xD800 <= val && val <= 0xDBFF) {
4712
NEXT;
4713
if (CUR == '\\') {
4714
NEXT;
4715
if (CUR == 'u') {
4716
int low = parse_escaped_codeunit(ctxt);
4717
if (0xDC00 <= low && low <= 0xDFFF) {
4718
return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4719
}
4720
}
4721
}
4722
ERROR("Invalid low surrogate pair code unit");
4723
val = -1;
4724
}
4725
return val;
4726
}
4727
4728
/**
4729
* xmlFAParseCharClassEsc:
4730
* @ctxt: a regexp parser context
4731
*
4732
* [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4733
* [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4734
* [25] catEsc ::= '\p{' charProp '}'
4735
* [26] complEsc ::= '\P{' charProp '}'
4736
* [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4737
*/
4738
static void
4739
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4740
int cur;
4741
4742
if (CUR == '.') {
4743
if (ctxt->atom == NULL) {
4744
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4745
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4746
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4747
XML_REGEXP_ANYCHAR, 0, 0, NULL);
4748
}
4749
NEXT;
4750
return;
4751
}
4752
if (CUR != '\\') {
4753
ERROR("Escaped sequence: expecting \\");
4754
return;
4755
}
4756
NEXT;
4757
cur = CUR;
4758
if (cur == 'p') {
4759
NEXT;
4760
if (CUR != '{') {
4761
ERROR("Expecting '{'");
4762
return;
4763
}
4764
NEXT;
4765
xmlFAParseCharProp(ctxt);
4766
if (CUR != '}') {
4767
ERROR("Expecting '}'");
4768
return;
4769
}
4770
NEXT;
4771
} else if (cur == 'P') {
4772
NEXT;
4773
if (CUR != '{') {
4774
ERROR("Expecting '{'");
4775
return;
4776
}
4777
NEXT;
4778
xmlFAParseCharProp(ctxt);
4779
if (ctxt->atom != NULL)
4780
ctxt->atom->neg = 1;
4781
if (CUR != '}') {
4782
ERROR("Expecting '}'");
4783
return;
4784
}
4785
NEXT;
4786
} else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4787
(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4788
(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4789
(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4790
(cur == 0x5E) ||
4791
4792
/* Non-standard escape sequences:
4793
* Java 1.8|.NET Core 3.1|MSXML 6 */
4794
(cur == '!') || /* + | + | + */
4795
(cur == '"') || /* + | + | + */
4796
(cur == '#') || /* + | + | + */
4797
(cur == '$') || /* + | + | + */
4798
(cur == '%') || /* + | + | + */
4799
(cur == ',') || /* + | + | + */
4800
(cur == '/') || /* + | + | + */
4801
(cur == ':') || /* + | + | + */
4802
(cur == ';') || /* + | + | + */
4803
(cur == '=') || /* + | + | + */
4804
(cur == '>') || /* | + | + */
4805
(cur == '@') || /* + | + | + */
4806
(cur == '`') || /* + | + | + */
4807
(cur == '~') || /* + | + | + */
4808
(cur == 'u')) { /* | + | + */
4809
if (ctxt->atom == NULL) {
4810
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4811
if (ctxt->atom != NULL) {
4812
switch (cur) {
4813
case 'n':
4814
ctxt->atom->codepoint = '\n';
4815
break;
4816
case 'r':
4817
ctxt->atom->codepoint = '\r';
4818
break;
4819
case 't':
4820
ctxt->atom->codepoint = '\t';
4821
break;
4822
case 'u':
4823
cur = parse_escaped_codepoint(ctxt);
4824
if (cur < 0) {
4825
return;
4826
}
4827
ctxt->atom->codepoint = cur;
4828
break;
4829
default:
4830
ctxt->atom->codepoint = cur;
4831
}
4832
}
4833
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4834
switch (cur) {
4835
case 'n':
4836
cur = '\n';
4837
break;
4838
case 'r':
4839
cur = '\r';
4840
break;
4841
case 't':
4842
cur = '\t';
4843
break;
4844
}
4845
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4846
XML_REGEXP_CHARVAL, cur, cur, NULL);
4847
}
4848
NEXT;
4849
} else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4850
(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4851
(cur == 'w') || (cur == 'W')) {
4852
xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4853
4854
switch (cur) {
4855
case 's':
4856
type = XML_REGEXP_ANYSPACE;
4857
break;
4858
case 'S':
4859
type = XML_REGEXP_NOTSPACE;
4860
break;
4861
case 'i':
4862
type = XML_REGEXP_INITNAME;
4863
break;
4864
case 'I':
4865
type = XML_REGEXP_NOTINITNAME;
4866
break;
4867
case 'c':
4868
type = XML_REGEXP_NAMECHAR;
4869
break;
4870
case 'C':
4871
type = XML_REGEXP_NOTNAMECHAR;
4872
break;
4873
case 'd':
4874
type = XML_REGEXP_DECIMAL;
4875
break;
4876
case 'D':
4877
type = XML_REGEXP_NOTDECIMAL;
4878
break;
4879
case 'w':
4880
type = XML_REGEXP_REALCHAR;
4881
break;
4882
case 'W':
4883
type = XML_REGEXP_NOTREALCHAR;
4884
break;
4885
}
4886
NEXT;
4887
if (ctxt->atom == NULL) {
4888
ctxt->atom = xmlRegNewAtom(ctxt, type);
4889
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4890
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4891
type, 0, 0, NULL);
4892
}
4893
} else {
4894
ERROR("Wrong escape sequence, misuse of character '\\'");
4895
}
4896
}
4897
4898
/**
4899
* xmlFAParseCharRange:
4900
* @ctxt: a regexp parser context
4901
*
4902
* [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4903
* [18] seRange ::= charOrEsc '-' charOrEsc
4904
* [20] charOrEsc ::= XmlChar | SingleCharEsc
4905
* [21] XmlChar ::= [^\#x2D#x5B#x5D]
4906
* [22] XmlCharIncDash ::= [^\#x5B#x5D]
4907
*/
4908
static void
4909
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4910
int cur, len;
4911
int start = -1;
4912
int end = -1;
4913
4914
if (CUR == '\0') {
4915
ERROR("Expecting ']'");
4916
return;
4917
}
4918
4919
cur = CUR;
4920
if (cur == '\\') {
4921
NEXT;
4922
cur = CUR;
4923
switch (cur) {
4924
case 'n': start = 0xA; break;
4925
case 'r': start = 0xD; break;
4926
case 't': start = 0x9; break;
4927
case '\\': case '|': case '.': case '-': case '^': case '?':
4928
case '*': case '+': case '{': case '}': case '(': case ')':
4929
case '[': case ']':
4930
start = cur; break;
4931
default:
4932
ERROR("Invalid escape value");
4933
return;
4934
}
4935
end = start;
4936
len = 1;
4937
} else if ((cur != 0x5B) && (cur != 0x5D)) {
4938
len = 4;
4939
end = start = xmlGetUTF8Char(ctxt->cur, &len);
4940
if (start < 0) {
4941
ERROR("Invalid UTF-8");
4942
return;
4943
}
4944
} else {
4945
ERROR("Expecting a char range");
4946
return;
4947
}
4948
/*
4949
* Since we are "inside" a range, we can assume ctxt->cur is past
4950
* the start of ctxt->string, and PREV should be safe
4951
*/
4952
if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
4953
NEXTL(len);
4954
return;
4955
}
4956
NEXTL(len);
4957
cur = CUR;
4958
if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
4959
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4960
XML_REGEXP_CHARVAL, start, end, NULL);
4961
return;
4962
}
4963
NEXT;
4964
cur = CUR;
4965
if (cur == '\\') {
4966
NEXT;
4967
cur = CUR;
4968
switch (cur) {
4969
case 'n': end = 0xA; break;
4970
case 'r': end = 0xD; break;
4971
case 't': end = 0x9; break;
4972
case '\\': case '|': case '.': case '-': case '^': case '?':
4973
case '*': case '+': case '{': case '}': case '(': case ')':
4974
case '[': case ']':
4975
end = cur; break;
4976
default:
4977
ERROR("Invalid escape value");
4978
return;
4979
}
4980
len = 1;
4981
} else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
4982
len = 4;
4983
end = xmlGetUTF8Char(ctxt->cur, &len);
4984
if (end < 0) {
4985
ERROR("Invalid UTF-8");
4986
return;
4987
}
4988
} else {
4989
ERROR("Expecting the end of a char range");
4990
return;
4991
}
4992
4993
/* TODO check that the values are acceptable character ranges for XML */
4994
if (end < start) {
4995
ERROR("End of range is before start of range");
4996
} else {
4997
NEXTL(len);
4998
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4999
XML_REGEXP_CHARVAL, start, end, NULL);
5000
}
5001
return;
5002
}
5003
5004
/**
5005
* xmlFAParsePosCharGroup:
5006
* @ctxt: a regexp parser context
5007
*
5008
* [14] posCharGroup ::= ( charRange | charClassEsc )+
5009
*/
5010
static void
5011
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5012
do {
5013
if (CUR == '\\') {
5014
xmlFAParseCharClassEsc(ctxt);
5015
} else {
5016
xmlFAParseCharRange(ctxt);
5017
}
5018
} while ((CUR != ']') && (CUR != '-') &&
5019
(CUR != 0) && (ctxt->error == 0));
5020
}
5021
5022
/**
5023
* xmlFAParseCharGroup:
5024
* @ctxt: a regexp parser context
5025
*
5026
* [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5027
* [15] negCharGroup ::= '^' posCharGroup
5028
* [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5029
* [12] charClassExpr ::= '[' charGroup ']'
5030
*/
5031
static void
5032
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5033
int neg = ctxt->neg;
5034
5035
if (CUR == '^') {
5036
NEXT;
5037
ctxt->neg = !ctxt->neg;
5038
xmlFAParsePosCharGroup(ctxt);
5039
ctxt->neg = neg;
5040
}
5041
while ((CUR != ']') && (ctxt->error == 0)) {
5042
if ((CUR == '-') && (NXT(1) == '[')) {
5043
NEXT; /* eat the '-' */
5044
NEXT; /* eat the '[' */
5045
ctxt->neg = 2;
5046
xmlFAParseCharGroup(ctxt);
5047
ctxt->neg = neg;
5048
if (CUR == ']') {
5049
NEXT;
5050
} else {
5051
ERROR("charClassExpr: ']' expected");
5052
}
5053
break;
5054
} else {
5055
xmlFAParsePosCharGroup(ctxt);
5056
}
5057
}
5058
}
5059
5060
/**
5061
* xmlFAParseCharClass:
5062
* @ctxt: a regexp parser context
5063
*
5064
* [11] charClass ::= charClassEsc | charClassExpr
5065
* [12] charClassExpr ::= '[' charGroup ']'
5066
*/
5067
static void
5068
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5069
if (CUR == '[') {
5070
NEXT;
5071
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5072
if (ctxt->atom == NULL)
5073
return;
5074
xmlFAParseCharGroup(ctxt);
5075
if (CUR == ']') {
5076
NEXT;
5077
} else {
5078
ERROR("xmlFAParseCharClass: ']' expected");
5079
}
5080
} else {
5081
xmlFAParseCharClassEsc(ctxt);
5082
}
5083
}
5084
5085
/**
5086
* xmlFAParseQuantExact:
5087
* @ctxt: a regexp parser context
5088
*
5089
* [8] QuantExact ::= [0-9]+
5090
*
5091
* Returns 0 if success or -1 in case of error
5092
*/
5093
static int
5094
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5095
int ret = 0;
5096
int ok = 0;
5097
int overflow = 0;
5098
5099
while ((CUR >= '0') && (CUR <= '9')) {
5100
if (ret > INT_MAX / 10) {
5101
overflow = 1;
5102
} else {
5103
int digit = CUR - '0';
5104
5105
ret *= 10;
5106
if (ret > INT_MAX - digit)
5107
overflow = 1;
5108
else
5109
ret += digit;
5110
}
5111
ok = 1;
5112
NEXT;
5113
}
5114
if ((ok != 1) || (overflow == 1)) {
5115
return(-1);
5116
}
5117
return(ret);
5118
}
5119
5120
/**
5121
* xmlFAParseQuantifier:
5122
* @ctxt: a regexp parser context
5123
*
5124
* [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5125
* [5] quantity ::= quantRange | quantMin | QuantExact
5126
* [6] quantRange ::= QuantExact ',' QuantExact
5127
* [7] quantMin ::= QuantExact ','
5128
* [8] QuantExact ::= [0-9]+
5129
*/
5130
static int
5131
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5132
int cur;
5133
5134
cur = CUR;
5135
if ((cur == '?') || (cur == '*') || (cur == '+')) {
5136
if (ctxt->atom != NULL) {
5137
if (cur == '?')
5138
ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5139
else if (cur == '*')
5140
ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5141
else if (cur == '+')
5142
ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5143
}
5144
NEXT;
5145
return(1);
5146
}
5147
if (cur == '{') {
5148
int min = 0, max = 0;
5149
5150
NEXT;
5151
cur = xmlFAParseQuantExact(ctxt);
5152
if (cur >= 0)
5153
min = cur;
5154
else {
5155
ERROR("Improper quantifier");
5156
}
5157
if (CUR == ',') {
5158
NEXT;
5159
if (CUR == '}')
5160
max = INT_MAX;
5161
else {
5162
cur = xmlFAParseQuantExact(ctxt);
5163
if (cur >= 0)
5164
max = cur;
5165
else {
5166
ERROR("Improper quantifier");
5167
}
5168
}
5169
}
5170
if (CUR == '}') {
5171
NEXT;
5172
} else {
5173
ERROR("Unterminated quantifier");
5174
}
5175
if (max == 0)
5176
max = min;
5177
if (ctxt->atom != NULL) {
5178
ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5179
ctxt->atom->min = min;
5180
ctxt->atom->max = max;
5181
}
5182
return(1);
5183
}
5184
return(0);
5185
}
5186
5187
/**
5188
* xmlFAParseAtom:
5189
* @ctxt: a regexp parser context
5190
*
5191
* [9] atom ::= Char | charClass | ( '(' regExp ')' )
5192
*/
5193
static int
5194
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5195
int codepoint, len;
5196
5197
codepoint = xmlFAIsChar(ctxt);
5198
if (codepoint > 0) {
5199
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5200
if (ctxt->atom == NULL)
5201
return(-1);
5202
len = 4;
5203
codepoint = xmlGetUTF8Char(ctxt->cur, &len);
5204
if (codepoint < 0) {
5205
ERROR("Invalid UTF-8");
5206
return(-1);
5207
}
5208
ctxt->atom->codepoint = codepoint;
5209
NEXTL(len);
5210
return(1);
5211
} else if (CUR == '|') {
5212
return(0);
5213
} else if (CUR == 0) {
5214
return(0);
5215
} else if (CUR == ')') {
5216
return(0);
5217
} else if (CUR == '(') {
5218
xmlRegStatePtr start, oldend, start0;
5219
5220
NEXT;
5221
if (ctxt->depth >= 50) {
5222
ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5223
return(-1);
5224
}
5225
/*
5226
* this extra Epsilon transition is needed if we count with 0 allowed
5227
* unfortunately this can't be known at that point
5228
*/
5229
xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5230
start0 = ctxt->state;
5231
xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5232
start = ctxt->state;
5233
oldend = ctxt->end;
5234
ctxt->end = NULL;
5235
ctxt->atom = NULL;
5236
ctxt->depth++;
5237
xmlFAParseRegExp(ctxt, 0);
5238
ctxt->depth--;
5239
if (CUR == ')') {
5240
NEXT;
5241
} else {
5242
ERROR("xmlFAParseAtom: expecting ')'");
5243
}
5244
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5245
if (ctxt->atom == NULL)
5246
return(-1);
5247
ctxt->atom->start = start;
5248
ctxt->atom->start0 = start0;
5249
ctxt->atom->stop = ctxt->state;
5250
ctxt->end = oldend;
5251
return(1);
5252
} else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5253
xmlFAParseCharClass(ctxt);
5254
return(1);
5255
}
5256
return(0);
5257
}
5258
5259
/**
5260
* xmlFAParsePiece:
5261
* @ctxt: a regexp parser context
5262
*
5263
* [3] piece ::= atom quantifier?
5264
*/
5265
static int
5266
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5267
int ret;
5268
5269
ctxt->atom = NULL;
5270
ret = xmlFAParseAtom(ctxt);
5271
if (ret == 0)
5272
return(0);
5273
if (ctxt->atom == NULL) {
5274
ERROR("internal: no atom generated");
5275
}
5276
xmlFAParseQuantifier(ctxt);
5277
return(1);
5278
}
5279
5280
/**
5281
* xmlFAParseBranch:
5282
* @ctxt: a regexp parser context
5283
* @to: optional target to the end of the branch
5284
*
5285
* @to is used to optimize by removing duplicate path in automata
5286
* in expressions like (a|b)(c|d)
5287
*
5288
* [2] branch ::= piece*
5289
*/
5290
static int
5291
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5292
xmlRegStatePtr previous;
5293
int ret;
5294
5295
previous = ctxt->state;
5296
ret = xmlFAParsePiece(ctxt);
5297
if (ret == 0) {
5298
/* Empty branch */
5299
xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5300
} else {
5301
if (xmlFAGenerateTransitions(ctxt, previous,
5302
(CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5303
ctxt->atom) < 0) {
5304
xmlRegFreeAtom(ctxt->atom);
5305
ctxt->atom = NULL;
5306
return(-1);
5307
}
5308
previous = ctxt->state;
5309
ctxt->atom = NULL;
5310
}
5311
while ((ret != 0) && (ctxt->error == 0)) {
5312
ret = xmlFAParsePiece(ctxt);
5313
if (ret != 0) {
5314
if (xmlFAGenerateTransitions(ctxt, previous,
5315
(CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5316
ctxt->atom) < 0) {
5317
xmlRegFreeAtom(ctxt->atom);
5318
ctxt->atom = NULL;
5319
return(-1);
5320
}
5321
previous = ctxt->state;
5322
ctxt->atom = NULL;
5323
}
5324
}
5325
return(0);
5326
}
5327
5328
/**
5329
* xmlFAParseRegExp:
5330
* @ctxt: a regexp parser context
5331
* @top: is this the top-level expression ?
5332
*
5333
* [1] regExp ::= branch ( '|' branch )*
5334
*/
5335
static void
5336
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5337
xmlRegStatePtr start, end;
5338
5339
/* if not top start should have been generated by an epsilon trans */
5340
start = ctxt->state;
5341
ctxt->end = NULL;
5342
xmlFAParseBranch(ctxt, NULL);
5343
if (top) {
5344
ctxt->state->type = XML_REGEXP_FINAL_STATE;
5345
}
5346
if (CUR != '|') {
5347
ctxt->end = ctxt->state;
5348
return;
5349
}
5350
end = ctxt->state;
5351
while ((CUR == '|') && (ctxt->error == 0)) {
5352
NEXT;
5353
ctxt->state = start;
5354
ctxt->end = NULL;
5355
xmlFAParseBranch(ctxt, end);
5356
}
5357
if (!top) {
5358
ctxt->state = end;
5359
ctxt->end = end;
5360
}
5361
}
5362
5363
/************************************************************************
5364
* *
5365
* The basic API *
5366
* *
5367
************************************************************************/
5368
5369
/**
5370
* xmlRegexpPrint:
5371
* @output: the file for the output debug
5372
* @regexp: the compiled regexp
5373
*
5374
* Print the content of the compiled regular expression
5375
*/
5376
void
5377
xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5378
int i;
5379
5380
if (output == NULL)
5381
return;
5382
fprintf(output, " regexp: ");
5383
if (regexp == NULL) {
5384
fprintf(output, "NULL\n");
5385
return;
5386
}
5387
fprintf(output, "'%s' ", regexp->string);
5388
fprintf(output, "\n");
5389
fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5390
for (i = 0;i < regexp->nbAtoms; i++) {
5391
fprintf(output, " %02d ", i);
5392
xmlRegPrintAtom(output, regexp->atoms[i]);
5393
}
5394
fprintf(output, "%d states:", regexp->nbStates);
5395
fprintf(output, "\n");
5396
for (i = 0;i < regexp->nbStates; i++) {
5397
xmlRegPrintState(output, regexp->states[i]);
5398
}
5399
fprintf(output, "%d counters:\n", regexp->nbCounters);
5400
for (i = 0;i < regexp->nbCounters; i++) {
5401
fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5402
regexp->counters[i].max);
5403
}
5404
}
5405
5406
/**
5407
* xmlRegexpCompile:
5408
* @regexp: a regular expression string
5409
*
5410
* Parses a regular expression conforming to XML Schemas Part 2 Datatype
5411
* Appendix F and builds an automata suitable for testing strings against
5412
* that regular expression
5413
*
5414
* Returns the compiled expression or NULL in case of error
5415
*/
5416
xmlRegexpPtr
5417
xmlRegexpCompile(const xmlChar *regexp) {
5418
xmlRegexpPtr ret = NULL;
5419
xmlRegParserCtxtPtr ctxt;
5420
5421
if (regexp == NULL)
5422
return(NULL);
5423
5424
ctxt = xmlRegNewParserCtxt(regexp);
5425
if (ctxt == NULL)
5426
return(NULL);
5427
5428
/* initialize the parser */
5429
ctxt->state = xmlRegStatePush(ctxt);
5430
if (ctxt->state == NULL)
5431
goto error;
5432
ctxt->start = ctxt->state;
5433
ctxt->end = NULL;
5434
5435
/* parse the expression building an automata */
5436
xmlFAParseRegExp(ctxt, 1);
5437
if (CUR != 0) {
5438
ERROR("xmlFAParseRegExp: extra characters");
5439
}
5440
if (ctxt->error != 0)
5441
goto error;
5442
ctxt->end = ctxt->state;
5443
ctxt->start->type = XML_REGEXP_START_STATE;
5444
ctxt->end->type = XML_REGEXP_FINAL_STATE;
5445
5446
/* remove the Epsilon except for counted transitions */
5447
xmlFAEliminateEpsilonTransitions(ctxt);
5448
5449
5450
if (ctxt->error != 0)
5451
goto error;
5452
ret = xmlRegEpxFromParse(ctxt);
5453
5454
error:
5455
xmlRegFreeParserCtxt(ctxt);
5456
return(ret);
5457
}
5458
5459
/**
5460
* xmlRegexpExec:
5461
* @comp: the compiled regular expression
5462
* @content: the value to check against the regular expression
5463
*
5464
* Check if the regular expression generates the value
5465
*
5466
* Returns 1 if it matches, 0 if not and a negative value in case of error
5467
*/
5468
int
5469
xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5470
if ((comp == NULL) || (content == NULL))
5471
return(-1);
5472
return(xmlFARegExec(comp, content));
5473
}
5474
5475
/**
5476
* xmlRegexpIsDeterminist:
5477
* @comp: the compiled regular expression
5478
*
5479
* Check if the regular expression is determinist
5480
*
5481
* Returns 1 if it yes, 0 if not and a negative value in case of error
5482
*/
5483
int
5484
xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5485
xmlAutomataPtr am;
5486
int ret;
5487
5488
if (comp == NULL)
5489
return(-1);
5490
if (comp->determinist != -1)
5491
return(comp->determinist);
5492
5493
am = xmlNewAutomata();
5494
if (am == NULL)
5495
return(-1);
5496
if (am->states != NULL) {
5497
int i;
5498
5499
for (i = 0;i < am->nbStates;i++)
5500
xmlRegFreeState(am->states[i]);
5501
xmlFree(am->states);
5502
}
5503
am->nbAtoms = comp->nbAtoms;
5504
am->atoms = comp->atoms;
5505
am->nbStates = comp->nbStates;
5506
am->states = comp->states;
5507
am->determinist = -1;
5508
am->flags = comp->flags;
5509
ret = xmlFAComputesDeterminism(am);
5510
am->atoms = NULL;
5511
am->states = NULL;
5512
xmlFreeAutomata(am);
5513
comp->determinist = ret;
5514
return(ret);
5515
}
5516
5517
/**
5518
* xmlRegFreeRegexp:
5519
* @regexp: the regexp
5520
*
5521
* Free a regexp
5522
*/
5523
void
5524
xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5525
int i;
5526
if (regexp == NULL)
5527
return;
5528
5529
if (regexp->string != NULL)
5530
xmlFree(regexp->string);
5531
if (regexp->states != NULL) {
5532
for (i = 0;i < regexp->nbStates;i++)
5533
xmlRegFreeState(regexp->states[i]);
5534
xmlFree(regexp->states);
5535
}
5536
if (regexp->atoms != NULL) {
5537
for (i = 0;i < regexp->nbAtoms;i++)
5538
xmlRegFreeAtom(regexp->atoms[i]);
5539
xmlFree(regexp->atoms);
5540
}
5541
if (regexp->counters != NULL)
5542
xmlFree(regexp->counters);
5543
if (regexp->compact != NULL)
5544
xmlFree(regexp->compact);
5545
if (regexp->transdata != NULL)
5546
xmlFree(regexp->transdata);
5547
if (regexp->stringMap != NULL) {
5548
for (i = 0; i < regexp->nbstrings;i++)
5549
xmlFree(regexp->stringMap[i]);
5550
xmlFree(regexp->stringMap);
5551
}
5552
5553
xmlFree(regexp);
5554
}
5555
5556
#ifdef LIBXML_AUTOMATA_ENABLED
5557
/************************************************************************
5558
* *
5559
* The Automata interface *
5560
* *
5561
************************************************************************/
5562
5563
/**
5564
* xmlNewAutomata:
5565
*
5566
* Create a new automata
5567
*
5568
* Returns the new object or NULL in case of failure
5569
*/
5570
xmlAutomataPtr
5571
xmlNewAutomata(void) {
5572
xmlAutomataPtr ctxt;
5573
5574
ctxt = xmlRegNewParserCtxt(NULL);
5575
if (ctxt == NULL)
5576
return(NULL);
5577
5578
/* initialize the parser */
5579
ctxt->state = xmlRegStatePush(ctxt);
5580
if (ctxt->state == NULL) {
5581
xmlFreeAutomata(ctxt);
5582
return(NULL);
5583
}
5584
ctxt->start = ctxt->state;
5585
ctxt->end = NULL;
5586
5587
ctxt->start->type = XML_REGEXP_START_STATE;
5588
ctxt->flags = 0;
5589
5590
return(ctxt);
5591
}
5592
5593
/**
5594
* xmlFreeAutomata:
5595
* @am: an automata
5596
*
5597
* Free an automata
5598
*/
5599
void
5600
xmlFreeAutomata(xmlAutomataPtr am) {
5601
if (am == NULL)
5602
return;
5603
xmlRegFreeParserCtxt(am);
5604
}
5605
5606
/**
5607
* xmlAutomataSetFlags:
5608
* @am: an automata
5609
* @flags: a set of internal flags
5610
*
5611
* Set some flags on the automata
5612
*/
5613
void
5614
xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5615
if (am == NULL)
5616
return;
5617
am->flags |= flags;
5618
}
5619
5620
/**
5621
* xmlAutomataGetInitState:
5622
* @am: an automata
5623
*
5624
* Initial state lookup
5625
*
5626
* Returns the initial state of the automata
5627
*/
5628
xmlAutomataStatePtr
5629
xmlAutomataGetInitState(xmlAutomataPtr am) {
5630
if (am == NULL)
5631
return(NULL);
5632
return(am->start);
5633
}
5634
5635
/**
5636
* xmlAutomataSetFinalState:
5637
* @am: an automata
5638
* @state: a state in this automata
5639
*
5640
* Makes that state a final state
5641
*
5642
* Returns 0 or -1 in case of error
5643
*/
5644
int
5645
xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5646
if ((am == NULL) || (state == NULL))
5647
return(-1);
5648
state->type = XML_REGEXP_FINAL_STATE;
5649
return(0);
5650
}
5651
5652
/**
5653
* xmlAutomataNewTransition:
5654
* @am: an automata
5655
* @from: the starting point of the transition
5656
* @to: the target point of the transition or NULL
5657
* @token: the input string associated to that transition
5658
* @data: data passed to the callback function if the transition is activated
5659
*
5660
* If @to is NULL, this creates first a new target state in the automata
5661
* and then adds a transition from the @from state to the target state
5662
* activated by the value of @token
5663
*
5664
* Returns the target state or NULL in case of error
5665
*/
5666
xmlAutomataStatePtr
5667
xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5668
xmlAutomataStatePtr to, const xmlChar *token,
5669
void *data) {
5670
xmlRegAtomPtr atom;
5671
5672
if ((am == NULL) || (from == NULL) || (token == NULL))
5673
return(NULL);
5674
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5675
if (atom == NULL)
5676
return(NULL);
5677
atom->data = data;
5678
atom->valuep = xmlStrdup(token);
5679
5680
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5681
xmlRegFreeAtom(atom);
5682
return(NULL);
5683
}
5684
if (to == NULL)
5685
return(am->state);
5686
return(to);
5687
}
5688
5689
/**
5690
* xmlAutomataNewTransition2:
5691
* @am: an automata
5692
* @from: the starting point of the transition
5693
* @to: the target point of the transition or NULL
5694
* @token: the first input string associated to that transition
5695
* @token2: the second input string associated to that transition
5696
* @data: data passed to the callback function if the transition is activated
5697
*
5698
* If @to is NULL, this creates first a new target state in the automata
5699
* and then adds a transition from the @from state to the target state
5700
* activated by the value of @token
5701
*
5702
* Returns the target state or NULL in case of error
5703
*/
5704
xmlAutomataStatePtr
5705
xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5706
xmlAutomataStatePtr to, const xmlChar *token,
5707
const xmlChar *token2, void *data) {
5708
xmlRegAtomPtr atom;
5709
5710
if ((am == NULL) || (from == NULL) || (token == NULL))
5711
return(NULL);
5712
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5713
if (atom == NULL)
5714
return(NULL);
5715
atom->data = data;
5716
if ((token2 == NULL) || (*token2 == 0)) {
5717
atom->valuep = xmlStrdup(token);
5718
} else {
5719
int lenn, lenp;
5720
xmlChar *str;
5721
5722
lenn = strlen((char *) token2);
5723
lenp = strlen((char *) token);
5724
5725
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5726
if (str == NULL) {
5727
xmlRegFreeAtom(atom);
5728
return(NULL);
5729
}
5730
memcpy(&str[0], token, lenp);
5731
str[lenp] = '|';
5732
memcpy(&str[lenp + 1], token2, lenn);
5733
str[lenn + lenp + 1] = 0;
5734
5735
atom->valuep = str;
5736
}
5737
5738
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5739
xmlRegFreeAtom(atom);
5740
return(NULL);
5741
}
5742
if (to == NULL)
5743
return(am->state);
5744
return(to);
5745
}
5746
5747
/**
5748
* xmlAutomataNewNegTrans:
5749
* @am: an automata
5750
* @from: the starting point of the transition
5751
* @to: the target point of the transition or NULL
5752
* @token: the first input string associated to that transition
5753
* @token2: the second input string associated to that transition
5754
* @data: data passed to the callback function if the transition is activated
5755
*
5756
* If @to is NULL, this creates first a new target state in the automata
5757
* and then adds a transition from the @from state to the target state
5758
* activated by any value except (@token,@token2)
5759
* Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5760
# the semantic of XSD ##other
5761
*
5762
* Returns the target state or NULL in case of error
5763
*/
5764
xmlAutomataStatePtr
5765
xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5766
xmlAutomataStatePtr to, const xmlChar *token,
5767
const xmlChar *token2, void *data) {
5768
xmlRegAtomPtr atom;
5769
xmlChar err_msg[200];
5770
5771
if ((am == NULL) || (from == NULL) || (token == NULL))
5772
return(NULL);
5773
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5774
if (atom == NULL)
5775
return(NULL);
5776
atom->data = data;
5777
atom->neg = 1;
5778
if ((token2 == NULL) || (*token2 == 0)) {
5779
atom->valuep = xmlStrdup(token);
5780
} else {
5781
int lenn, lenp;
5782
xmlChar *str;
5783
5784
lenn = strlen((char *) token2);
5785
lenp = strlen((char *) token);
5786
5787
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5788
if (str == NULL) {
5789
xmlRegFreeAtom(atom);
5790
return(NULL);
5791
}
5792
memcpy(&str[0], token, lenp);
5793
str[lenp] = '|';
5794
memcpy(&str[lenp + 1], token2, lenn);
5795
str[lenn + lenp + 1] = 0;
5796
5797
atom->valuep = str;
5798
}
5799
snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5800
err_msg[199] = 0;
5801
atom->valuep2 = xmlStrdup(err_msg);
5802
5803
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5804
xmlRegFreeAtom(atom);
5805
return(NULL);
5806
}
5807
am->negs++;
5808
if (to == NULL)
5809
return(am->state);
5810
return(to);
5811
}
5812
5813
/**
5814
* xmlAutomataNewCountTrans2:
5815
* @am: an automata
5816
* @from: the starting point of the transition
5817
* @to: the target point of the transition or NULL
5818
* @token: the input string associated to that transition
5819
* @token2: the second input string associated to that transition
5820
* @min: the minimum successive occurrences of token
5821
* @max: the maximum successive occurrences of token
5822
* @data: data associated to the transition
5823
*
5824
* If @to is NULL, this creates first a new target state in the automata
5825
* and then adds a transition from the @from state to the target state
5826
* activated by a succession of input of value @token and @token2 and
5827
* whose number is between @min and @max
5828
*
5829
* Returns the target state or NULL in case of error
5830
*/
5831
xmlAutomataStatePtr
5832
xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5833
xmlAutomataStatePtr to, const xmlChar *token,
5834
const xmlChar *token2,
5835
int min, int max, void *data) {
5836
xmlRegAtomPtr atom;
5837
int counter;
5838
5839
if ((am == NULL) || (from == NULL) || (token == NULL))
5840
return(NULL);
5841
if (min < 0)
5842
return(NULL);
5843
if ((max < min) || (max < 1))
5844
return(NULL);
5845
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5846
if (atom == NULL)
5847
return(NULL);
5848
if ((token2 == NULL) || (*token2 == 0)) {
5849
atom->valuep = xmlStrdup(token);
5850
if (atom->valuep == NULL)
5851
goto error;
5852
} else {
5853
int lenn, lenp;
5854
xmlChar *str;
5855
5856
lenn = strlen((char *) token2);
5857
lenp = strlen((char *) token);
5858
5859
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5860
if (str == NULL)
5861
goto error;
5862
memcpy(&str[0], token, lenp);
5863
str[lenp] = '|';
5864
memcpy(&str[lenp + 1], token2, lenn);
5865
str[lenn + lenp + 1] = 0;
5866
5867
atom->valuep = str;
5868
}
5869
atom->data = data;
5870
if (min == 0)
5871
atom->min = 1;
5872
else
5873
atom->min = min;
5874
atom->max = max;
5875
5876
/*
5877
* associate a counter to the transition.
5878
*/
5879
counter = xmlRegGetCounter(am);
5880
if (counter < 0)
5881
goto error;
5882
am->counters[counter].min = min;
5883
am->counters[counter].max = max;
5884
5885
/* xmlFAGenerateTransitions(am, from, to, atom); */
5886
if (to == NULL) {
5887
to = xmlRegStatePush(am);
5888
if (to == NULL)
5889
goto error;
5890
}
5891
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5892
if (xmlRegAtomPush(am, atom) < 0)
5893
goto error;
5894
am->state = to;
5895
5896
if (to == NULL)
5897
to = am->state;
5898
if (to == NULL)
5899
return(NULL);
5900
if (min == 0)
5901
xmlFAGenerateEpsilonTransition(am, from, to);
5902
return(to);
5903
5904
error:
5905
xmlRegFreeAtom(atom);
5906
return(NULL);
5907
}
5908
5909
/**
5910
* xmlAutomataNewCountTrans:
5911
* @am: an automata
5912
* @from: the starting point of the transition
5913
* @to: the target point of the transition or NULL
5914
* @token: the input string associated to that transition
5915
* @min: the minimum successive occurrences of token
5916
* @max: the maximum successive occurrences of token
5917
* @data: data associated to the transition
5918
*
5919
* If @to is NULL, this creates first a new target state in the automata
5920
* and then adds a transition from the @from state to the target state
5921
* activated by a succession of input of value @token and whose number
5922
* is between @min and @max
5923
*
5924
* Returns the target state or NULL in case of error
5925
*/
5926
xmlAutomataStatePtr
5927
xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5928
xmlAutomataStatePtr to, const xmlChar *token,
5929
int min, int max, void *data) {
5930
xmlRegAtomPtr atom;
5931
int counter;
5932
5933
if ((am == NULL) || (from == NULL) || (token == NULL))
5934
return(NULL);
5935
if (min < 0)
5936
return(NULL);
5937
if ((max < min) || (max < 1))
5938
return(NULL);
5939
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5940
if (atom == NULL)
5941
return(NULL);
5942
atom->valuep = xmlStrdup(token);
5943
if (atom->valuep == NULL)
5944
goto error;
5945
atom->data = data;
5946
if (min == 0)
5947
atom->min = 1;
5948
else
5949
atom->min = min;
5950
atom->max = max;
5951
5952
/*
5953
* associate a counter to the transition.
5954
*/
5955
counter = xmlRegGetCounter(am);
5956
if (counter < 0)
5957
goto error;
5958
am->counters[counter].min = min;
5959
am->counters[counter].max = max;
5960
5961
/* xmlFAGenerateTransitions(am, from, to, atom); */
5962
if (to == NULL) {
5963
to = xmlRegStatePush(am);
5964
if (to == NULL)
5965
goto error;
5966
}
5967
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5968
if (xmlRegAtomPush(am, atom) < 0)
5969
goto error;
5970
am->state = to;
5971
5972
if (to == NULL)
5973
to = am->state;
5974
if (to == NULL)
5975
return(NULL);
5976
if (min == 0)
5977
xmlFAGenerateEpsilonTransition(am, from, to);
5978
return(to);
5979
5980
error:
5981
xmlRegFreeAtom(atom);
5982
return(NULL);
5983
}
5984
5985
/**
5986
* xmlAutomataNewOnceTrans2:
5987
* @am: an automata
5988
* @from: the starting point of the transition
5989
* @to: the target point of the transition or NULL
5990
* @token: the input string associated to that transition
5991
* @token2: the second input string associated to that transition
5992
* @min: the minimum successive occurrences of token
5993
* @max: the maximum successive occurrences of token
5994
* @data: data associated to the transition
5995
*
5996
* If @to is NULL, this creates first a new target state in the automata
5997
* and then adds a transition from the @from state to the target state
5998
* activated by a succession of input of value @token and @token2 and whose
5999
* number is between @min and @max, moreover that transition can only be
6000
* crossed once.
6001
*
6002
* Returns the target state or NULL in case of error
6003
*/
6004
xmlAutomataStatePtr
6005
xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6006
xmlAutomataStatePtr to, const xmlChar *token,
6007
const xmlChar *token2,
6008
int min, int max, void *data) {
6009
xmlRegAtomPtr atom;
6010
int counter;
6011
6012
if ((am == NULL) || (from == NULL) || (token == NULL))
6013
return(NULL);
6014
if (min < 1)
6015
return(NULL);
6016
if (max < min)
6017
return(NULL);
6018
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6019
if (atom == NULL)
6020
return(NULL);
6021
if ((token2 == NULL) || (*token2 == 0)) {
6022
atom->valuep = xmlStrdup(token);
6023
if (atom->valuep == NULL)
6024
goto error;
6025
} else {
6026
int lenn, lenp;
6027
xmlChar *str;
6028
6029
lenn = strlen((char *) token2);
6030
lenp = strlen((char *) token);
6031
6032
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6033
if (str == NULL)
6034
goto error;
6035
memcpy(&str[0], token, lenp);
6036
str[lenp] = '|';
6037
memcpy(&str[lenp + 1], token2, lenn);
6038
str[lenn + lenp + 1] = 0;
6039
6040
atom->valuep = str;
6041
}
6042
atom->data = data;
6043
atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6044
atom->min = min;
6045
atom->max = max;
6046
/*
6047
* associate a counter to the transition.
6048
*/
6049
counter = xmlRegGetCounter(am);
6050
if (counter < 0)
6051
goto error;
6052
am->counters[counter].min = 1;
6053
am->counters[counter].max = 1;
6054
6055
/* xmlFAGenerateTransitions(am, from, to, atom); */
6056
if (to == NULL) {
6057
to = xmlRegStatePush(am);
6058
if (to == NULL)
6059
goto error;
6060
}
6061
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6062
if (xmlRegAtomPush(am, atom) < 0)
6063
goto error;
6064
am->state = to;
6065
return(to);
6066
6067
error:
6068
xmlRegFreeAtom(atom);
6069
return(NULL);
6070
}
6071
6072
6073
6074
/**
6075
* xmlAutomataNewOnceTrans:
6076
* @am: an automata
6077
* @from: the starting point of the transition
6078
* @to: the target point of the transition or NULL
6079
* @token: the input string associated to that transition
6080
* @min: the minimum successive occurrences of token
6081
* @max: the maximum successive occurrences of token
6082
* @data: data associated to the transition
6083
*
6084
* If @to is NULL, this creates first a new target state in the automata
6085
* and then adds a transition from the @from state to the target state
6086
* activated by a succession of input of value @token and whose number
6087
* is between @min and @max, moreover that transition can only be crossed
6088
* once.
6089
*
6090
* Returns the target state or NULL in case of error
6091
*/
6092
xmlAutomataStatePtr
6093
xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6094
xmlAutomataStatePtr to, const xmlChar *token,
6095
int min, int max, void *data) {
6096
xmlRegAtomPtr atom;
6097
int counter;
6098
6099
if ((am == NULL) || (from == NULL) || (token == NULL))
6100
return(NULL);
6101
if (min < 1)
6102
return(NULL);
6103
if (max < min)
6104
return(NULL);
6105
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6106
if (atom == NULL)
6107
return(NULL);
6108
atom->valuep = xmlStrdup(token);
6109
atom->data = data;
6110
atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6111
atom->min = min;
6112
atom->max = max;
6113
/*
6114
* associate a counter to the transition.
6115
*/
6116
counter = xmlRegGetCounter(am);
6117
if (counter < 0)
6118
goto error;
6119
am->counters[counter].min = 1;
6120
am->counters[counter].max = 1;
6121
6122
/* xmlFAGenerateTransitions(am, from, to, atom); */
6123
if (to == NULL) {
6124
to = xmlRegStatePush(am);
6125
if (to == NULL)
6126
goto error;
6127
}
6128
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6129
if (xmlRegAtomPush(am, atom) < 0)
6130
goto error;
6131
am->state = to;
6132
return(to);
6133
6134
error:
6135
xmlRegFreeAtom(atom);
6136
return(NULL);
6137
}
6138
6139
/**
6140
* xmlAutomataNewState:
6141
* @am: an automata
6142
*
6143
* Create a new disconnected state in the automata
6144
*
6145
* Returns the new state or NULL in case of error
6146
*/
6147
xmlAutomataStatePtr
6148
xmlAutomataNewState(xmlAutomataPtr am) {
6149
if (am == NULL)
6150
return(NULL);
6151
return(xmlRegStatePush(am));
6152
}
6153
6154
/**
6155
* xmlAutomataNewEpsilon:
6156
* @am: an automata
6157
* @from: the starting point of the transition
6158
* @to: the target point of the transition or NULL
6159
*
6160
* If @to is NULL, this creates first a new target state in the automata
6161
* and then adds an epsilon transition from the @from state to the
6162
* target state
6163
*
6164
* Returns the target state or NULL in case of error
6165
*/
6166
xmlAutomataStatePtr
6167
xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6168
xmlAutomataStatePtr to) {
6169
if ((am == NULL) || (from == NULL))
6170
return(NULL);
6171
xmlFAGenerateEpsilonTransition(am, from, to);
6172
if (to == NULL)
6173
return(am->state);
6174
return(to);
6175
}
6176
6177
/**
6178
* xmlAutomataNewAllTrans:
6179
* @am: an automata
6180
* @from: the starting point of the transition
6181
* @to: the target point of the transition or NULL
6182
* @lax: allow to transition if not all all transitions have been activated
6183
*
6184
* If @to is NULL, this creates first a new target state in the automata
6185
* and then adds a an ALL transition from the @from state to the
6186
* target state. That transition is an epsilon transition allowed only when
6187
* all transitions from the @from node have been activated.
6188
*
6189
* Returns the target state or NULL in case of error
6190
*/
6191
xmlAutomataStatePtr
6192
xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6193
xmlAutomataStatePtr to, int lax) {
6194
if ((am == NULL) || (from == NULL))
6195
return(NULL);
6196
xmlFAGenerateAllTransition(am, from, to, lax);
6197
if (to == NULL)
6198
return(am->state);
6199
return(to);
6200
}
6201
6202
/**
6203
* xmlAutomataNewCounter:
6204
* @am: an automata
6205
* @min: the minimal value on the counter
6206
* @max: the maximal value on the counter
6207
*
6208
* Create a new counter
6209
*
6210
* Returns the counter number or -1 in case of error
6211
*/
6212
int
6213
xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6214
int ret;
6215
6216
if (am == NULL)
6217
return(-1);
6218
6219
ret = xmlRegGetCounter(am);
6220
if (ret < 0)
6221
return(-1);
6222
am->counters[ret].min = min;
6223
am->counters[ret].max = max;
6224
return(ret);
6225
}
6226
6227
/**
6228
* xmlAutomataNewCountedTrans:
6229
* @am: an automata
6230
* @from: the starting point of the transition
6231
* @to: the target point of the transition or NULL
6232
* @counter: the counter associated to that transition
6233
*
6234
* If @to is NULL, this creates first a new target state in the automata
6235
* and then adds an epsilon transition from the @from state to the target state
6236
* which will increment the counter provided
6237
*
6238
* Returns the target state or NULL in case of error
6239
*/
6240
xmlAutomataStatePtr
6241
xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6242
xmlAutomataStatePtr to, int counter) {
6243
if ((am == NULL) || (from == NULL) || (counter < 0))
6244
return(NULL);
6245
xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6246
if (to == NULL)
6247
return(am->state);
6248
return(to);
6249
}
6250
6251
/**
6252
* xmlAutomataNewCounterTrans:
6253
* @am: an automata
6254
* @from: the starting point of the transition
6255
* @to: the target point of the transition or NULL
6256
* @counter: the counter associated to that transition
6257
*
6258
* If @to is NULL, this creates first a new target state in the automata
6259
* and then adds an epsilon transition from the @from state to the target state
6260
* which will be allowed only if the counter is within the right range.
6261
*
6262
* Returns the target state or NULL in case of error
6263
*/
6264
xmlAutomataStatePtr
6265
xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6266
xmlAutomataStatePtr to, int counter) {
6267
if ((am == NULL) || (from == NULL) || (counter < 0))
6268
return(NULL);
6269
xmlFAGenerateCountedTransition(am, from, to, counter);
6270
if (to == NULL)
6271
return(am->state);
6272
return(to);
6273
}
6274
6275
/**
6276
* xmlAutomataCompile:
6277
* @am: an automata
6278
*
6279
* Compile the automata into a Reg Exp ready for being executed.
6280
* The automata should be free after this point.
6281
*
6282
* Returns the compiled regexp or NULL in case of error
6283
*/
6284
xmlRegexpPtr
6285
xmlAutomataCompile(xmlAutomataPtr am) {
6286
xmlRegexpPtr ret;
6287
6288
if ((am == NULL) || (am->error != 0)) return(NULL);
6289
xmlFAEliminateEpsilonTransitions(am);
6290
/* xmlFAComputesDeterminism(am); */
6291
ret = xmlRegEpxFromParse(am);
6292
6293
return(ret);
6294
}
6295
6296
/**
6297
* xmlAutomataIsDeterminist:
6298
* @am: an automata
6299
*
6300
* Checks if an automata is determinist.
6301
*
6302
* Returns 1 if true, 0 if not, and -1 in case of error
6303
*/
6304
int
6305
xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6306
int ret;
6307
6308
if (am == NULL)
6309
return(-1);
6310
6311
ret = xmlFAComputesDeterminism(am);
6312
return(ret);
6313
}
6314
#endif /* LIBXML_AUTOMATA_ENABLED */
6315
6316
#ifdef LIBXML_EXPR_ENABLED
6317
/************************************************************************
6318
* *
6319
* Formal Expression handling code *
6320
* *
6321
************************************************************************/
6322
/************************************************************************
6323
* *
6324
* Expression handling context *
6325
* *
6326
************************************************************************/
6327
6328
struct _xmlExpCtxt {
6329
xmlDictPtr dict;
6330
xmlExpNodePtr *table;
6331
int size;
6332
int nbElems;
6333
int nb_nodes;
6334
int maxNodes;
6335
const char *expr;
6336
const char *cur;
6337
int nb_cons;
6338
int tabSize;
6339
};
6340
6341
/**
6342
* xmlExpNewCtxt:
6343
* @maxNodes: the maximum number of nodes
6344
* @dict: optional dictionary to use internally
6345
*
6346
* Creates a new context for manipulating expressions
6347
*
6348
* Returns the context or NULL in case of error
6349
*/
6350
xmlExpCtxtPtr
6351
xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6352
xmlExpCtxtPtr ret;
6353
int size = 256;
6354
6355
if (maxNodes <= 4096)
6356
maxNodes = 4096;
6357
6358
ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6359
if (ret == NULL)
6360
return(NULL);
6361
memset(ret, 0, sizeof(xmlExpCtxt));
6362
ret->size = size;
6363
ret->nbElems = 0;
6364
ret->maxNodes = maxNodes;
6365
ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6366
if (ret->table == NULL) {
6367
xmlFree(ret);
6368
return(NULL);
6369
}
6370
memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6371
if (dict == NULL) {
6372
ret->dict = xmlDictCreate();
6373
if (ret->dict == NULL) {
6374
xmlFree(ret->table);
6375
xmlFree(ret);
6376
return(NULL);
6377
}
6378
} else {
6379
ret->dict = dict;
6380
xmlDictReference(ret->dict);
6381
}
6382
return(ret);
6383
}
6384
6385
/**
6386
* xmlExpFreeCtxt:
6387
* @ctxt: an expression context
6388
*
6389
* Free an expression context
6390
*/
6391
void
6392
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6393
if (ctxt == NULL)
6394
return;
6395
xmlDictFree(ctxt->dict);
6396
if (ctxt->table != NULL)
6397
xmlFree(ctxt->table);
6398
xmlFree(ctxt);
6399
}
6400
6401
/************************************************************************
6402
* *
6403
* Structure associated to an expression node *
6404
* *
6405
************************************************************************/
6406
#define MAX_NODES 10000
6407
6408
/*
6409
* TODO:
6410
* - Wildcards
6411
* - public API for creation
6412
*
6413
* Started
6414
* - regression testing
6415
*
6416
* Done
6417
* - split into module and test tool
6418
* - memleaks
6419
*/
6420
6421
typedef enum {
6422
XML_EXP_NILABLE = (1 << 0)
6423
} xmlExpNodeInfo;
6424
6425
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6426
6427
struct _xmlExpNode {
6428
unsigned char type;/* xmlExpNodeType */
6429
unsigned char info;/* OR of xmlExpNodeInfo */
6430
unsigned short key; /* the hash key */
6431
unsigned int ref; /* The number of references */
6432
int c_max; /* the maximum length it can consume */
6433
xmlExpNodePtr exp_left;
6434
xmlExpNodePtr next;/* the next node in the hash table or free list */
6435
union {
6436
struct {
6437
int f_min;
6438
int f_max;
6439
} count;
6440
struct {
6441
xmlExpNodePtr f_right;
6442
} children;
6443
const xmlChar *f_str;
6444
} field;
6445
};
6446
6447
#define exp_min field.count.f_min
6448
#define exp_max field.count.f_max
6449
/* #define exp_left field.children.f_left */
6450
#define exp_right field.children.f_right
6451
#define exp_str field.f_str
6452
6453
static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6454
static xmlExpNode forbiddenExpNode = {
6455
XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6456
};
6457
xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6458
static xmlExpNode emptyExpNode = {
6459
XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6460
};
6461
xmlExpNodePtr emptyExp = &emptyExpNode;
6462
6463
/************************************************************************
6464
* *
6465
* The custom hash table for unicity and canonicalization *
6466
* of sub-expressions pointers *
6467
* *
6468
************************************************************************/
6469
/*
6470
* xmlExpHashNameComputeKey:
6471
* Calculate the hash key for a token
6472
*/
6473
static unsigned short
6474
xmlExpHashNameComputeKey(const xmlChar *name) {
6475
unsigned short value = 0L;
6476
char ch;
6477
6478
if (name != NULL) {
6479
value += 30 * (*name);
6480
while ((ch = *name++) != 0) {
6481
value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6482
}
6483
}
6484
return (value);
6485
}
6486
6487
/*
6488
* xmlExpHashComputeKey:
6489
* Calculate the hash key for a compound expression
6490
*/
6491
static unsigned short
6492
xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6493
xmlExpNodePtr right) {
6494
unsigned long value;
6495
unsigned short ret;
6496
6497
switch (type) {
6498
case XML_EXP_SEQ:
6499
value = left->key;
6500
value += right->key;
6501
value *= 3;
6502
ret = (unsigned short) value;
6503
break;
6504
case XML_EXP_OR:
6505
value = left->key;
6506
value += right->key;
6507
value *= 7;
6508
ret = (unsigned short) value;
6509
break;
6510
case XML_EXP_COUNT:
6511
value = left->key;
6512
value += right->key;
6513
ret = (unsigned short) value;
6514
break;
6515
default:
6516
ret = 0;
6517
}
6518
return(ret);
6519
}
6520
6521
6522
static xmlExpNodePtr
6523
xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6524
xmlExpNodePtr ret;
6525
6526
if (ctxt->nb_nodes >= MAX_NODES)
6527
return(NULL);
6528
ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6529
if (ret == NULL)
6530
return(NULL);
6531
memset(ret, 0, sizeof(xmlExpNode));
6532
ret->type = type;
6533
ret->next = NULL;
6534
ctxt->nb_nodes++;
6535
ctxt->nb_cons++;
6536
return(ret);
6537
}
6538
6539
/**
6540
* xmlExpHashGetEntry:
6541
* @table: the hash table
6542
*
6543
* Get the unique entry from the hash table. The entry is created if
6544
* needed. @left and @right are consumed, i.e. their ref count will
6545
* be decremented by the operation.
6546
*
6547
* Returns the pointer or NULL in case of error
6548
*/
6549
static xmlExpNodePtr
6550
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6551
xmlExpNodePtr left, xmlExpNodePtr right,
6552
const xmlChar *name, int min, int max) {
6553
unsigned short kbase, key;
6554
xmlExpNodePtr entry;
6555
xmlExpNodePtr insert;
6556
6557
if (ctxt == NULL)
6558
return(NULL);
6559
6560
/*
6561
* Check for duplicate and insertion location.
6562
*/
6563
if (type == XML_EXP_ATOM) {
6564
kbase = xmlExpHashNameComputeKey(name);
6565
} else if (type == XML_EXP_COUNT) {
6566
/* COUNT reduction rule 1 */
6567
/* a{1} -> a */
6568
if (min == max) {
6569
if (min == 1) {
6570
return(left);
6571
}
6572
if (min == 0) {
6573
xmlExpFree(ctxt, left);
6574
return(emptyExp);
6575
}
6576
}
6577
if (min < 0) {
6578
xmlExpFree(ctxt, left);
6579
return(forbiddenExp);
6580
}
6581
if (max == -1)
6582
kbase = min + 79;
6583
else
6584
kbase = max - min;
6585
kbase += left->key;
6586
} else if (type == XML_EXP_OR) {
6587
/* Forbid reduction rules */
6588
if (left->type == XML_EXP_FORBID) {
6589
xmlExpFree(ctxt, left);
6590
return(right);
6591
}
6592
if (right->type == XML_EXP_FORBID) {
6593
xmlExpFree(ctxt, right);
6594
return(left);
6595
}
6596
6597
/* OR reduction rule 1 */
6598
/* a | a reduced to a */
6599
if (left == right) {
6600
xmlExpFree(ctxt, right);
6601
return(left);
6602
}
6603
/* OR canonicalization rule 1 */
6604
/* linearize (a | b) | c into a | (b | c) */
6605
if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6606
xmlExpNodePtr tmp = left;
6607
left = right;
6608
right = tmp;
6609
}
6610
/* OR reduction rule 2 */
6611
/* a | (a | b) and b | (a | b) are reduced to a | b */
6612
if (right->type == XML_EXP_OR) {
6613
if ((left == right->exp_left) ||
6614
(left == right->exp_right)) {
6615
xmlExpFree(ctxt, left);
6616
return(right);
6617
}
6618
}
6619
/* OR canonicalization rule 2 */
6620
/* linearize (a | b) | c into a | (b | c) */
6621
if (left->type == XML_EXP_OR) {
6622
xmlExpNodePtr tmp;
6623
6624
/* OR canonicalization rule 2 */
6625
if ((left->exp_right->type != XML_EXP_OR) &&
6626
(left->exp_right->key < left->exp_left->key)) {
6627
tmp = left->exp_right;
6628
left->exp_right = left->exp_left;
6629
left->exp_left = tmp;
6630
}
6631
left->exp_right->ref++;
6632
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6633
NULL, 0, 0);
6634
left->exp_left->ref++;
6635
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6636
NULL, 0, 0);
6637
6638
xmlExpFree(ctxt, left);
6639
return(tmp);
6640
}
6641
if (right->type == XML_EXP_OR) {
6642
/* Ordering in the tree */
6643
/* C | (A | B) -> A | (B | C) */
6644
if (left->key > right->exp_right->key) {
6645
xmlExpNodePtr tmp;
6646
right->exp_right->ref++;
6647
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6648
left, NULL, 0, 0);
6649
right->exp_left->ref++;
6650
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6651
tmp, NULL, 0, 0);
6652
xmlExpFree(ctxt, right);
6653
return(tmp);
6654
}
6655
/* Ordering in the tree */
6656
/* B | (A | C) -> A | (B | C) */
6657
if (left->key > right->exp_left->key) {
6658
xmlExpNodePtr tmp;
6659
right->exp_right->ref++;
6660
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6661
right->exp_right, NULL, 0, 0);
6662
right->exp_left->ref++;
6663
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6664
tmp, NULL, 0, 0);
6665
xmlExpFree(ctxt, right);
6666
return(tmp);
6667
}
6668
}
6669
/* we know both types are != XML_EXP_OR here */
6670
else if (left->key > right->key) {
6671
xmlExpNodePtr tmp = left;
6672
left = right;
6673
right = tmp;
6674
}
6675
kbase = xmlExpHashComputeKey(type, left, right);
6676
} else if (type == XML_EXP_SEQ) {
6677
/* Forbid reduction rules */
6678
if (left->type == XML_EXP_FORBID) {
6679
xmlExpFree(ctxt, right);
6680
return(left);
6681
}
6682
if (right->type == XML_EXP_FORBID) {
6683
xmlExpFree(ctxt, left);
6684
return(right);
6685
}
6686
/* Empty reduction rules */
6687
if (right->type == XML_EXP_EMPTY) {
6688
return(left);
6689
}
6690
if (left->type == XML_EXP_EMPTY) {
6691
return(right);
6692
}
6693
kbase = xmlExpHashComputeKey(type, left, right);
6694
} else
6695
return(NULL);
6696
6697
key = kbase % ctxt->size;
6698
if (ctxt->table[key] != NULL) {
6699
for (insert = ctxt->table[key]; insert != NULL;
6700
insert = insert->next) {
6701
if ((insert->key == kbase) &&
6702
(insert->type == type)) {
6703
if (type == XML_EXP_ATOM) {
6704
if (name == insert->exp_str) {
6705
insert->ref++;
6706
return(insert);
6707
}
6708
} else if (type == XML_EXP_COUNT) {
6709
if ((insert->exp_min == min) && (insert->exp_max == max) &&
6710
(insert->exp_left == left)) {
6711
insert->ref++;
6712
left->ref--;
6713
return(insert);
6714
}
6715
} else if ((insert->exp_left == left) &&
6716
(insert->exp_right == right)) {
6717
insert->ref++;
6718
left->ref--;
6719
right->ref--;
6720
return(insert);
6721
}
6722
}
6723
}
6724
}
6725
6726
entry = xmlExpNewNode(ctxt, type);
6727
if (entry == NULL)
6728
return(NULL);
6729
entry->key = kbase;
6730
if (type == XML_EXP_ATOM) {
6731
entry->exp_str = name;
6732
entry->c_max = 1;
6733
} else if (type == XML_EXP_COUNT) {
6734
entry->exp_min = min;
6735
entry->exp_max = max;
6736
entry->exp_left = left;
6737
if ((min == 0) || (IS_NILLABLE(left)))
6738
entry->info |= XML_EXP_NILABLE;
6739
if (max < 0)
6740
entry->c_max = -1;
6741
else
6742
entry->c_max = max * entry->exp_left->c_max;
6743
} else {
6744
entry->exp_left = left;
6745
entry->exp_right = right;
6746
if (type == XML_EXP_OR) {
6747
if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6748
entry->info |= XML_EXP_NILABLE;
6749
if ((entry->exp_left->c_max == -1) ||
6750
(entry->exp_right->c_max == -1))
6751
entry->c_max = -1;
6752
else if (entry->exp_left->c_max > entry->exp_right->c_max)
6753
entry->c_max = entry->exp_left->c_max;
6754
else
6755
entry->c_max = entry->exp_right->c_max;
6756
} else {
6757
if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6758
entry->info |= XML_EXP_NILABLE;
6759
if ((entry->exp_left->c_max == -1) ||
6760
(entry->exp_right->c_max == -1))
6761
entry->c_max = -1;
6762
else
6763
entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6764
}
6765
}
6766
entry->ref = 1;
6767
if (ctxt->table[key] != NULL)
6768
entry->next = ctxt->table[key];
6769
6770
ctxt->table[key] = entry;
6771
ctxt->nbElems++;
6772
6773
return(entry);
6774
}
6775
6776
/**
6777
* xmlExpFree:
6778
* @ctxt: the expression context
6779
* @exp: the expression
6780
*
6781
* Dereference the expression
6782
*/
6783
void
6784
xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6785
if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6786
return;
6787
exp->ref--;
6788
if (exp->ref == 0) {
6789
unsigned short key;
6790
6791
/* Unlink it first from the hash table */
6792
key = exp->key % ctxt->size;
6793
if (ctxt->table[key] == exp) {
6794
ctxt->table[key] = exp->next;
6795
} else {
6796
xmlExpNodePtr tmp;
6797
6798
tmp = ctxt->table[key];
6799
while (tmp != NULL) {
6800
if (tmp->next == exp) {
6801
tmp->next = exp->next;
6802
break;
6803
}
6804
tmp = tmp->next;
6805
}
6806
}
6807
6808
if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6809
xmlExpFree(ctxt, exp->exp_left);
6810
xmlExpFree(ctxt, exp->exp_right);
6811
} else if (exp->type == XML_EXP_COUNT) {
6812
xmlExpFree(ctxt, exp->exp_left);
6813
}
6814
xmlFree(exp);
6815
ctxt->nb_nodes--;
6816
}
6817
}
6818
6819
/**
6820
* xmlExpRef:
6821
* @exp: the expression
6822
*
6823
* Increase the reference count of the expression
6824
*/
6825
void
6826
xmlExpRef(xmlExpNodePtr exp) {
6827
if (exp != NULL)
6828
exp->ref++;
6829
}
6830
6831
/**
6832
* xmlExpNewAtom:
6833
* @ctxt: the expression context
6834
* @name: the atom name
6835
* @len: the atom name length in byte (or -1);
6836
*
6837
* Get the atom associated to this name from that context
6838
*
6839
* Returns the node or NULL in case of error
6840
*/
6841
xmlExpNodePtr
6842
xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6843
if ((ctxt == NULL) || (name == NULL))
6844
return(NULL);
6845
name = xmlDictLookup(ctxt->dict, name, len);
6846
if (name == NULL)
6847
return(NULL);
6848
return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6849
}
6850
6851
/**
6852
* xmlExpNewOr:
6853
* @ctxt: the expression context
6854
* @left: left expression
6855
* @right: right expression
6856
*
6857
* Get the atom associated to the choice @left | @right
6858
* Note that @left and @right are consumed in the operation, to keep
6859
* an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6860
* this is true even in case of failure (unless ctxt == NULL).
6861
*
6862
* Returns the node or NULL in case of error
6863
*/
6864
xmlExpNodePtr
6865
xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6866
if (ctxt == NULL)
6867
return(NULL);
6868
if ((left == NULL) || (right == NULL)) {
6869
xmlExpFree(ctxt, left);
6870
xmlExpFree(ctxt, right);
6871
return(NULL);
6872
}
6873
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6874
}
6875
6876
/**
6877
* xmlExpNewSeq:
6878
* @ctxt: the expression context
6879
* @left: left expression
6880
* @right: right expression
6881
*
6882
* Get the atom associated to the sequence @left , @right
6883
* Note that @left and @right are consumed in the operation, to keep
6884
* an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6885
* this is true even in case of failure (unless ctxt == NULL).
6886
*
6887
* Returns the node or NULL in case of error
6888
*/
6889
xmlExpNodePtr
6890
xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6891
if (ctxt == NULL)
6892
return(NULL);
6893
if ((left == NULL) || (right == NULL)) {
6894
xmlExpFree(ctxt, left);
6895
xmlExpFree(ctxt, right);
6896
return(NULL);
6897
}
6898
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6899
}
6900
6901
/**
6902
* xmlExpNewRange:
6903
* @ctxt: the expression context
6904
* @subset: the expression to be repeated
6905
* @min: the lower bound for the repetition
6906
* @max: the upper bound for the repetition, -1 means infinite
6907
*
6908
* Get the atom associated to the range (@subset){@min, @max}
6909
* Note that @subset is consumed in the operation, to keep
6910
* an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6911
* this is true even in case of failure (unless ctxt == NULL).
6912
*
6913
* Returns the node or NULL in case of error
6914
*/
6915
xmlExpNodePtr
6916
xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6917
if (ctxt == NULL)
6918
return(NULL);
6919
if ((subset == NULL) || (min < 0) || (max < -1) ||
6920
((max >= 0) && (min > max))) {
6921
xmlExpFree(ctxt, subset);
6922
return(NULL);
6923
}
6924
return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6925
NULL, NULL, min, max));
6926
}
6927
6928
/************************************************************************
6929
* *
6930
* Public API for operations on expressions *
6931
* *
6932
************************************************************************/
6933
6934
static int
6935
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6936
const xmlChar**list, int len, int nb) {
6937
int tmp, tmp2;
6938
tail:
6939
switch (exp->type) {
6940
case XML_EXP_EMPTY:
6941
return(0);
6942
case XML_EXP_ATOM:
6943
for (tmp = 0;tmp < nb;tmp++)
6944
if (list[tmp] == exp->exp_str)
6945
return(0);
6946
if (nb >= len)
6947
return(-2);
6948
list[nb] = exp->exp_str;
6949
return(1);
6950
case XML_EXP_COUNT:
6951
exp = exp->exp_left;
6952
goto tail;
6953
case XML_EXP_SEQ:
6954
case XML_EXP_OR:
6955
tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6956
if (tmp < 0)
6957
return(tmp);
6958
tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6959
nb + tmp);
6960
if (tmp2 < 0)
6961
return(tmp2);
6962
return(tmp + tmp2);
6963
}
6964
return(-1);
6965
}
6966
6967
/**
6968
* xmlExpGetLanguage:
6969
* @ctxt: the expression context
6970
* @exp: the expression
6971
* @langList: where to store the tokens
6972
* @len: the allocated length of @list
6973
*
6974
* Find all the strings used in @exp and store them in @list
6975
*
6976
* Returns the number of unique strings found, -1 in case of errors and
6977
* -2 if there is more than @len strings
6978
*/
6979
int
6980
xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6981
const xmlChar**langList, int len) {
6982
if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6983
return(-1);
6984
return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6985
}
6986
6987
static int
6988
xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6989
const xmlChar**list, int len, int nb) {
6990
int tmp, tmp2;
6991
tail:
6992
switch (exp->type) {
6993
case XML_EXP_FORBID:
6994
return(0);
6995
case XML_EXP_EMPTY:
6996
return(0);
6997
case XML_EXP_ATOM:
6998
for (tmp = 0;tmp < nb;tmp++)
6999
if (list[tmp] == exp->exp_str)
7000
return(0);
7001
if (nb >= len)
7002
return(-2);
7003
list[nb] = exp->exp_str;
7004
return(1);
7005
case XML_EXP_COUNT:
7006
exp = exp->exp_left;
7007
goto tail;
7008
case XML_EXP_SEQ:
7009
tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7010
if (tmp < 0)
7011
return(tmp);
7012
if (IS_NILLABLE(exp->exp_left)) {
7013
tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7014
nb + tmp);
7015
if (tmp2 < 0)
7016
return(tmp2);
7017
tmp += tmp2;
7018
}
7019
return(tmp);
7020
case XML_EXP_OR:
7021
tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7022
if (tmp < 0)
7023
return(tmp);
7024
tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7025
nb + tmp);
7026
if (tmp2 < 0)
7027
return(tmp2);
7028
return(tmp + tmp2);
7029
}
7030
return(-1);
7031
}
7032
7033
/**
7034
* xmlExpGetStart:
7035
* @ctxt: the expression context
7036
* @exp: the expression
7037
* @tokList: where to store the tokens
7038
* @len: the allocated length of @list
7039
*
7040
* Find all the strings that appears at the start of the languages
7041
* accepted by @exp and store them in @list. E.g. for (a, b) | c
7042
* it will return the list [a, c]
7043
*
7044
* Returns the number of unique strings found, -1 in case of errors and
7045
* -2 if there is more than @len strings
7046
*/
7047
int
7048
xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7049
const xmlChar**tokList, int len) {
7050
if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7051
return(-1);
7052
return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7053
}
7054
7055
/**
7056
* xmlExpIsNillable:
7057
* @exp: the expression
7058
*
7059
* Finds if the expression is nillable, i.e. if it accepts the empty sequence
7060
*
7061
* Returns 1 if nillable, 0 if not and -1 in case of error
7062
*/
7063
int
7064
xmlExpIsNillable(xmlExpNodePtr exp) {
7065
if (exp == NULL)
7066
return(-1);
7067
return(IS_NILLABLE(exp) != 0);
7068
}
7069
7070
static xmlExpNodePtr
7071
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7072
{
7073
xmlExpNodePtr ret;
7074
7075
switch (exp->type) {
7076
case XML_EXP_EMPTY:
7077
return(forbiddenExp);
7078
case XML_EXP_FORBID:
7079
return(forbiddenExp);
7080
case XML_EXP_ATOM:
7081
if (exp->exp_str == str) {
7082
ret = emptyExp;
7083
} else {
7084
/* TODO wildcards here */
7085
ret = forbiddenExp;
7086
}
7087
return(ret);
7088
case XML_EXP_OR: {
7089
xmlExpNodePtr tmp;
7090
7091
tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7092
if (tmp == NULL) {
7093
return(NULL);
7094
}
7095
ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7096
if (ret == NULL) {
7097
xmlExpFree(ctxt, tmp);
7098
return(NULL);
7099
}
7100
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7101
NULL, 0, 0);
7102
return(ret);
7103
}
7104
case XML_EXP_SEQ:
7105
ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7106
if (ret == NULL) {
7107
return(NULL);
7108
} else if (ret == forbiddenExp) {
7109
if (IS_NILLABLE(exp->exp_left)) {
7110
ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7111
}
7112
} else {
7113
exp->exp_right->ref++;
7114
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7115
NULL, 0, 0);
7116
}
7117
return(ret);
7118
case XML_EXP_COUNT: {
7119
int min, max;
7120
xmlExpNodePtr tmp;
7121
7122
if (exp->exp_max == 0)
7123
return(forbiddenExp);
7124
ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7125
if (ret == NULL)
7126
return(NULL);
7127
if (ret == forbiddenExp) {
7128
return(ret);
7129
}
7130
if (exp->exp_max == 1)
7131
return(ret);
7132
if (exp->exp_max < 0) /* unbounded */
7133
max = -1;
7134
else
7135
max = exp->exp_max - 1;
7136
if (exp->exp_min > 0)
7137
min = exp->exp_min - 1;
7138
else
7139
min = 0;
7140
exp->exp_left->ref++;
7141
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7142
NULL, min, max);
7143
if (ret == emptyExp) {
7144
return(tmp);
7145
}
7146
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7147
NULL, 0, 0));
7148
}
7149
}
7150
return(NULL);
7151
}
7152
7153
/**
7154
* xmlExpStringDerive:
7155
* @ctxt: the expression context
7156
* @exp: the expression
7157
* @str: the string
7158
* @len: the string len in bytes if available
7159
*
7160
* Do one step of Brzozowski derivation of the expression @exp with
7161
* respect to the input string
7162
*
7163
* Returns the resulting expression or NULL in case of internal error
7164
*/
7165
xmlExpNodePtr
7166
xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7167
const xmlChar *str, int len) {
7168
const xmlChar *input;
7169
7170
if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7171
return(NULL);
7172
}
7173
/*
7174
* check the string is in the dictionary, if yes use an interned
7175
* copy, otherwise we know it's not an acceptable input
7176
*/
7177
input = xmlDictExists(ctxt->dict, str, len);
7178
if (input == NULL) {
7179
return(forbiddenExp);
7180
}
7181
return(xmlExpStringDeriveInt(ctxt, exp, input));
7182
}
7183
7184
static int
7185
xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7186
int ret = 1;
7187
7188
if (sub->c_max == -1) {
7189
if (exp->c_max != -1)
7190
ret = 0;
7191
} else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7192
ret = 0;
7193
}
7194
#if 0
7195
if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7196
ret = 0;
7197
#endif
7198
return(ret);
7199
}
7200
7201
static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7202
xmlExpNodePtr sub);
7203
/**
7204
* xmlExpDivide:
7205
* @ctxt: the expressions context
7206
* @exp: the englobing expression
7207
* @sub: the subexpression
7208
* @mult: the multiple expression
7209
* @remain: the remain from the derivation of the multiple
7210
*
7211
* Check if exp is a multiple of sub, i.e. if there is a finite number n
7212
* so that sub{n} subsume exp
7213
*
7214
* Returns the multiple value if successful, 0 if it is not a multiple
7215
* and -1 in case of internal error.
7216
*/
7217
7218
static int
7219
xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7220
xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7221
int i;
7222
xmlExpNodePtr tmp, tmp2;
7223
7224
if (mult != NULL) *mult = NULL;
7225
if (remain != NULL) *remain = NULL;
7226
if (exp->c_max == -1) return(0);
7227
if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7228
7229
for (i = 1;i <= exp->c_max;i++) {
7230
sub->ref++;
7231
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7232
sub, NULL, NULL, i, i);
7233
if (tmp == NULL) {
7234
return(-1);
7235
}
7236
if (!xmlExpCheckCard(tmp, exp)) {
7237
xmlExpFree(ctxt, tmp);
7238
continue;
7239
}
7240
tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7241
if (tmp2 == NULL) {
7242
xmlExpFree(ctxt, tmp);
7243
return(-1);
7244
}
7245
if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7246
if (remain != NULL)
7247
*remain = tmp2;
7248
else
7249
xmlExpFree(ctxt, tmp2);
7250
if (mult != NULL)
7251
*mult = tmp;
7252
else
7253
xmlExpFree(ctxt, tmp);
7254
return(i);
7255
}
7256
xmlExpFree(ctxt, tmp);
7257
xmlExpFree(ctxt, tmp2);
7258
}
7259
return(0);
7260
}
7261
7262
/**
7263
* xmlExpExpDeriveInt:
7264
* @ctxt: the expressions context
7265
* @exp: the englobing expression
7266
* @sub: the subexpression
7267
*
7268
* Try to do a step of Brzozowski derivation but at a higher level
7269
* the input being a subexpression.
7270
*
7271
* Returns the resulting expression or NULL in case of internal error
7272
*/
7273
static xmlExpNodePtr
7274
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7275
xmlExpNodePtr ret, tmp, tmp2, tmp3;
7276
const xmlChar **tab;
7277
int len, i;
7278
7279
/*
7280
* In case of equality and if the expression can only consume a finite
7281
* amount, then the derivation is empty
7282
*/
7283
if ((exp == sub) && (exp->c_max >= 0)) {
7284
return(emptyExp);
7285
}
7286
/*
7287
* decompose sub sequence first
7288
*/
7289
if (sub->type == XML_EXP_EMPTY) {
7290
exp->ref++;
7291
return(exp);
7292
}
7293
if (sub->type == XML_EXP_SEQ) {
7294
tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7295
if (tmp == NULL)
7296
return(NULL);
7297
if (tmp == forbiddenExp)
7298
return(tmp);
7299
ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7300
xmlExpFree(ctxt, tmp);
7301
return(ret);
7302
}
7303
if (sub->type == XML_EXP_OR) {
7304
tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7305
if (tmp == forbiddenExp)
7306
return(tmp);
7307
if (tmp == NULL)
7308
return(NULL);
7309
ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7310
if ((ret == NULL) || (ret == forbiddenExp)) {
7311
xmlExpFree(ctxt, tmp);
7312
return(ret);
7313
}
7314
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7315
}
7316
if (!xmlExpCheckCard(exp, sub)) {
7317
return(forbiddenExp);
7318
}
7319
switch (exp->type) {
7320
case XML_EXP_EMPTY:
7321
if (sub == emptyExp)
7322
return(emptyExp);
7323
return(forbiddenExp);
7324
case XML_EXP_FORBID:
7325
return(forbiddenExp);
7326
case XML_EXP_ATOM:
7327
if (sub->type == XML_EXP_ATOM) {
7328
/* TODO: handle wildcards */
7329
if (exp->exp_str == sub->exp_str) {
7330
return(emptyExp);
7331
}
7332
return(forbiddenExp);
7333
}
7334
if ((sub->type == XML_EXP_COUNT) &&
7335
(sub->exp_max == 1) &&
7336
(sub->exp_left->type == XML_EXP_ATOM)) {
7337
/* TODO: handle wildcards */
7338
if (exp->exp_str == sub->exp_left->exp_str) {
7339
return(emptyExp);
7340
}
7341
return(forbiddenExp);
7342
}
7343
return(forbiddenExp);
7344
case XML_EXP_SEQ:
7345
/* try to get the sequence consumed only if possible */
7346
if (xmlExpCheckCard(exp->exp_left, sub)) {
7347
/* See if the sequence can be consumed directly */
7348
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7349
if ((ret != forbiddenExp) && (ret != NULL)) {
7350
/*
7351
* TODO: assumption here that we are determinist
7352
* i.e. we won't get to a nillable exp left
7353
* subset which could be matched by the right
7354
* part too.
7355
* e.g.: (a | b)+,(a | c) and 'a+,a'
7356
*/
7357
exp->exp_right->ref++;
7358
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7359
exp->exp_right, NULL, 0, 0));
7360
}
7361
}
7362
/* Try instead to decompose */
7363
if (sub->type == XML_EXP_COUNT) {
7364
int min, max;
7365
7366
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7367
if (ret == NULL)
7368
return(NULL);
7369
if (ret != forbiddenExp) {
7370
if (sub->exp_max < 0)
7371
max = -1;
7372
else
7373
max = sub->exp_max -1;
7374
if (sub->exp_min > 0)
7375
min = sub->exp_min -1;
7376
else
7377
min = 0;
7378
exp->exp_right->ref++;
7379
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7380
exp->exp_right, NULL, 0, 0);
7381
if (tmp == NULL)
7382
return(NULL);
7383
7384
sub->exp_left->ref++;
7385
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7386
sub->exp_left, NULL, NULL, min, max);
7387
if (tmp2 == NULL) {
7388
xmlExpFree(ctxt, tmp);
7389
return(NULL);
7390
}
7391
ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7392
xmlExpFree(ctxt, tmp);
7393
xmlExpFree(ctxt, tmp2);
7394
return(ret);
7395
}
7396
}
7397
/* we made no progress on structured operations */
7398
break;
7399
case XML_EXP_OR:
7400
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7401
if (ret == NULL)
7402
return(NULL);
7403
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7404
if (tmp == NULL) {
7405
xmlExpFree(ctxt, ret);
7406
return(NULL);
7407
}
7408
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7409
case XML_EXP_COUNT: {
7410
int min, max;
7411
7412
if (sub->type == XML_EXP_COUNT) {
7413
/*
7414
* Try to see if the loop is completely subsumed
7415
*/
7416
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7417
if (tmp == NULL)
7418
return(NULL);
7419
if (tmp == forbiddenExp) {
7420
int mult;
7421
7422
mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7423
NULL, &tmp);
7424
if (mult <= 0) {
7425
return(forbiddenExp);
7426
}
7427
if (sub->exp_max == -1) {
7428
max = -1;
7429
if (exp->exp_max == -1) {
7430
if (exp->exp_min <= sub->exp_min * mult)
7431
min = 0;
7432
else
7433
min = exp->exp_min - sub->exp_min * mult;
7434
} else {
7435
xmlExpFree(ctxt, tmp);
7436
return(forbiddenExp);
7437
}
7438
} else {
7439
if (exp->exp_max == -1) {
7440
if (exp->exp_min > sub->exp_min * mult) {
7441
max = -1;
7442
min = exp->exp_min - sub->exp_min * mult;
7443
} else {
7444
max = -1;
7445
min = 0;
7446
}
7447
} else {
7448
if (exp->exp_max < sub->exp_max * mult) {
7449
xmlExpFree(ctxt, tmp);
7450
return(forbiddenExp);
7451
}
7452
if (sub->exp_max * mult > exp->exp_min)
7453
min = 0;
7454
else
7455
min = exp->exp_min - sub->exp_max * mult;
7456
max = exp->exp_max - sub->exp_max * mult;
7457
}
7458
}
7459
} else if (!IS_NILLABLE(tmp)) {
7460
/*
7461
* TODO: loop here to try to grow if working on finite
7462
* blocks.
7463
*/
7464
xmlExpFree(ctxt, tmp);
7465
return(forbiddenExp);
7466
} else if (sub->exp_max == -1) {
7467
if (exp->exp_max == -1) {
7468
if (exp->exp_min <= sub->exp_min) {
7469
max = -1;
7470
min = 0;
7471
} else {
7472
max = -1;
7473
min = exp->exp_min - sub->exp_min;
7474
}
7475
} else if (exp->exp_min > sub->exp_min) {
7476
xmlExpFree(ctxt, tmp);
7477
return(forbiddenExp);
7478
} else {
7479
max = -1;
7480
min = 0;
7481
}
7482
} else {
7483
if (exp->exp_max == -1) {
7484
if (exp->exp_min > sub->exp_min) {
7485
max = -1;
7486
min = exp->exp_min - sub->exp_min;
7487
} else {
7488
max = -1;
7489
min = 0;
7490
}
7491
} else {
7492
if (exp->exp_max < sub->exp_max) {
7493
xmlExpFree(ctxt, tmp);
7494
return(forbiddenExp);
7495
}
7496
if (sub->exp_max > exp->exp_min)
7497
min = 0;
7498
else
7499
min = exp->exp_min - sub->exp_max;
7500
max = exp->exp_max - sub->exp_max;
7501
}
7502
}
7503
exp->exp_left->ref++;
7504
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7505
NULL, NULL, min, max);
7506
if (tmp2 == NULL) {
7507
return(NULL);
7508
}
7509
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7510
NULL, 0, 0);
7511
return(ret);
7512
}
7513
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7514
if (tmp == NULL)
7515
return(NULL);
7516
if (tmp == forbiddenExp) {
7517
return(forbiddenExp);
7518
}
7519
if (exp->exp_min > 0)
7520
min = exp->exp_min - 1;
7521
else
7522
min = 0;
7523
if (exp->exp_max < 0)
7524
max = -1;
7525
else
7526
max = exp->exp_max - 1;
7527
7528
exp->exp_left->ref++;
7529
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7530
NULL, NULL, min, max);
7531
if (tmp2 == NULL)
7532
return(NULL);
7533
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7534
NULL, 0, 0);
7535
return(ret);
7536
}
7537
}
7538
7539
if (IS_NILLABLE(sub)) {
7540
if (!(IS_NILLABLE(exp)))
7541
return(forbiddenExp);
7542
else
7543
ret = emptyExp;
7544
} else
7545
ret = NULL;
7546
/*
7547
* here the structured derivation made no progress so
7548
* we use the default token based derivation to force one more step
7549
*/
7550
if (ctxt->tabSize == 0)
7551
ctxt->tabSize = 40;
7552
7553
tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7554
sizeof(const xmlChar *));
7555
if (tab == NULL) {
7556
return(NULL);
7557
}
7558
7559
/*
7560
* collect all the strings accepted by the subexpression on input
7561
*/
7562
len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7563
while (len < 0) {
7564
const xmlChar **temp;
7565
temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7566
sizeof(const xmlChar *));
7567
if (temp == NULL) {
7568
xmlFree((xmlChar **) tab);
7569
return(NULL);
7570
}
7571
tab = temp;
7572
ctxt->tabSize *= 2;
7573
len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7574
}
7575
for (i = 0;i < len;i++) {
7576
tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7577
if ((tmp == NULL) || (tmp == forbiddenExp)) {
7578
xmlExpFree(ctxt, ret);
7579
xmlFree((xmlChar **) tab);
7580
return(tmp);
7581
}
7582
tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7583
if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7584
xmlExpFree(ctxt, tmp);
7585
xmlExpFree(ctxt, ret);
7586
xmlFree((xmlChar **) tab);
7587
return(tmp);
7588
}
7589
tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7590
xmlExpFree(ctxt, tmp);
7591
xmlExpFree(ctxt, tmp2);
7592
7593
if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7594
xmlExpFree(ctxt, ret);
7595
xmlFree((xmlChar **) tab);
7596
return(tmp3);
7597
}
7598
7599
if (ret == NULL)
7600
ret = tmp3;
7601
else {
7602
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7603
if (ret == NULL) {
7604
xmlFree((xmlChar **) tab);
7605
return(NULL);
7606
}
7607
}
7608
}
7609
xmlFree((xmlChar **) tab);
7610
return(ret);
7611
}
7612
7613
/**
7614
* xmlExpExpDerive:
7615
* @ctxt: the expressions context
7616
* @exp: the englobing expression
7617
* @sub: the subexpression
7618
*
7619
* Evaluates the expression resulting from @exp consuming a sub expression @sub
7620
* Based on algebraic derivation and sometimes direct Brzozowski derivation
7621
* it usually takes less than linear time and can handle expressions generating
7622
* infinite languages.
7623
*
7624
* Returns the resulting expression or NULL in case of internal error, the
7625
* result must be freed
7626
*/
7627
xmlExpNodePtr
7628
xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7629
if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7630
return(NULL);
7631
7632
/*
7633
* O(1) speedups
7634
*/
7635
if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7636
return(forbiddenExp);
7637
}
7638
if (xmlExpCheckCard(exp, sub) == 0) {
7639
return(forbiddenExp);
7640
}
7641
return(xmlExpExpDeriveInt(ctxt, exp, sub));
7642
}
7643
7644
/**
7645
* xmlExpSubsume:
7646
* @ctxt: the expressions context
7647
* @exp: the englobing expression
7648
* @sub: the subexpression
7649
*
7650
* Check whether @exp accepts all the languages accepted by @sub
7651
* the input being a subexpression.
7652
*
7653
* Returns 1 if true 0 if false and -1 in case of failure.
7654
*/
7655
int
7656
xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7657
xmlExpNodePtr tmp;
7658
7659
if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7660
return(-1);
7661
7662
/*
7663
* TODO: speedup by checking the language of sub is a subset of the
7664
* language of exp
7665
*/
7666
/*
7667
* O(1) speedups
7668
*/
7669
if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7670
return(0);
7671
}
7672
if (xmlExpCheckCard(exp, sub) == 0) {
7673
return(0);
7674
}
7675
tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7676
if (tmp == NULL)
7677
return(-1);
7678
if (tmp == forbiddenExp)
7679
return(0);
7680
if (tmp == emptyExp)
7681
return(1);
7682
if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7683
xmlExpFree(ctxt, tmp);
7684
return(1);
7685
}
7686
xmlExpFree(ctxt, tmp);
7687
return(0);
7688
}
7689
7690
/************************************************************************
7691
* *
7692
* Parsing expression *
7693
* *
7694
************************************************************************/
7695
7696
static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7697
7698
#undef CUR
7699
#define CUR (*ctxt->cur)
7700
#undef NEXT
7701
#define NEXT ctxt->cur++;
7702
#undef IS_BLANK
7703
#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7704
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7705
7706
static int
7707
xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7708
int ret = 0;
7709
7710
SKIP_BLANKS
7711
if (CUR == '*') {
7712
NEXT
7713
return(-1);
7714
}
7715
if ((CUR < '0') || (CUR > '9'))
7716
return(-1);
7717
while ((CUR >= '0') && (CUR <= '9')) {
7718
ret = ret * 10 + (CUR - '0');
7719
NEXT
7720
}
7721
return(ret);
7722
}
7723
7724
static xmlExpNodePtr
7725
xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7726
const char *base;
7727
xmlExpNodePtr ret;
7728
const xmlChar *val;
7729
7730
SKIP_BLANKS
7731
base = ctxt->cur;
7732
if (*ctxt->cur == '(') {
7733
NEXT
7734
ret = xmlExpParseExpr(ctxt);
7735
SKIP_BLANKS
7736
if (*ctxt->cur != ')') {
7737
fprintf(stderr, "unbalanced '(' : %s\n", base);
7738
xmlExpFree(ctxt, ret);
7739
return(NULL);
7740
}
7741
NEXT;
7742
SKIP_BLANKS
7743
goto parse_quantifier;
7744
}
7745
while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7746
(CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7747
(CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7748
NEXT;
7749
val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7750
if (val == NULL)
7751
return(NULL);
7752
ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7753
if (ret == NULL)
7754
return(NULL);
7755
SKIP_BLANKS
7756
parse_quantifier:
7757
if (CUR == '{') {
7758
int min, max;
7759
7760
NEXT
7761
min = xmlExpParseNumber(ctxt);
7762
if (min < 0) {
7763
xmlExpFree(ctxt, ret);
7764
return(NULL);
7765
}
7766
SKIP_BLANKS
7767
if (CUR == ',') {
7768
NEXT
7769
max = xmlExpParseNumber(ctxt);
7770
SKIP_BLANKS
7771
} else
7772
max = min;
7773
if (CUR != '}') {
7774
xmlExpFree(ctxt, ret);
7775
return(NULL);
7776
}
7777
NEXT
7778
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7779
min, max);
7780
SKIP_BLANKS
7781
} else if (CUR == '?') {
7782
NEXT
7783
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7784
0, 1);
7785
SKIP_BLANKS
7786
} else if (CUR == '+') {
7787
NEXT
7788
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7789
1, -1);
7790
SKIP_BLANKS
7791
} else if (CUR == '*') {
7792
NEXT
7793
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7794
0, -1);
7795
SKIP_BLANKS
7796
}
7797
return(ret);
7798
}
7799
7800
7801
static xmlExpNodePtr
7802
xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7803
xmlExpNodePtr ret, right;
7804
7805
ret = xmlExpParseOr(ctxt);
7806
SKIP_BLANKS
7807
while (CUR == '|') {
7808
NEXT
7809
right = xmlExpParseOr(ctxt);
7810
if (right == NULL) {
7811
xmlExpFree(ctxt, ret);
7812
return(NULL);
7813
}
7814
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7815
if (ret == NULL)
7816
return(NULL);
7817
}
7818
return(ret);
7819
}
7820
7821
static xmlExpNodePtr
7822
xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7823
xmlExpNodePtr ret, right;
7824
7825
ret = xmlExpParseSeq(ctxt);
7826
SKIP_BLANKS
7827
while (CUR == ',') {
7828
NEXT
7829
right = xmlExpParseSeq(ctxt);
7830
if (right == NULL) {
7831
xmlExpFree(ctxt, ret);
7832
return(NULL);
7833
}
7834
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7835
if (ret == NULL)
7836
return(NULL);
7837
}
7838
return(ret);
7839
}
7840
7841
/**
7842
* xmlExpParse:
7843
* @ctxt: the expressions context
7844
* @expr: the 0 terminated string
7845
*
7846
* Minimal parser for regexps, it understand the following constructs
7847
* - string terminals
7848
* - choice operator |
7849
* - sequence operator ,
7850
* - subexpressions (...)
7851
* - usual cardinality operators + * and ?
7852
* - finite sequences { min, max }
7853
* - infinite sequences { min, * }
7854
* There is minimal checkings made especially no checking on strings values
7855
*
7856
* Returns a new expression or NULL in case of failure
7857
*/
7858
xmlExpNodePtr
7859
xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7860
xmlExpNodePtr ret;
7861
7862
ctxt->expr = expr;
7863
ctxt->cur = expr;
7864
7865
ret = xmlExpParseExpr(ctxt);
7866
SKIP_BLANKS
7867
if (*ctxt->cur != 0) {
7868
xmlExpFree(ctxt, ret);
7869
return(NULL);
7870
}
7871
return(ret);
7872
}
7873
7874
static void
7875
xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7876
xmlExpNodePtr c;
7877
7878
if (expr == NULL) return;
7879
if (glob) xmlBufferWriteChar(buf, "(");
7880
switch (expr->type) {
7881
case XML_EXP_EMPTY:
7882
xmlBufferWriteChar(buf, "empty");
7883
break;
7884
case XML_EXP_FORBID:
7885
xmlBufferWriteChar(buf, "forbidden");
7886
break;
7887
case XML_EXP_ATOM:
7888
xmlBufferWriteCHAR(buf, expr->exp_str);
7889
break;
7890
case XML_EXP_SEQ:
7891
c = expr->exp_left;
7892
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7893
xmlExpDumpInt(buf, c, 1);
7894
else
7895
xmlExpDumpInt(buf, c, 0);
7896
xmlBufferWriteChar(buf, " , ");
7897
c = expr->exp_right;
7898
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7899
xmlExpDumpInt(buf, c, 1);
7900
else
7901
xmlExpDumpInt(buf, c, 0);
7902
break;
7903
case XML_EXP_OR:
7904
c = expr->exp_left;
7905
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7906
xmlExpDumpInt(buf, c, 1);
7907
else
7908
xmlExpDumpInt(buf, c, 0);
7909
xmlBufferWriteChar(buf, " | ");
7910
c = expr->exp_right;
7911
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7912
xmlExpDumpInt(buf, c, 1);
7913
else
7914
xmlExpDumpInt(buf, c, 0);
7915
break;
7916
case XML_EXP_COUNT: {
7917
char rep[40];
7918
7919
c = expr->exp_left;
7920
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7921
xmlExpDumpInt(buf, c, 1);
7922
else
7923
xmlExpDumpInt(buf, c, 0);
7924
if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7925
rep[0] = '?';
7926
rep[1] = 0;
7927
} else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7928
rep[0] = '*';
7929
rep[1] = 0;
7930
} else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7931
rep[0] = '+';
7932
rep[1] = 0;
7933
} else if (expr->exp_max == expr->exp_min) {
7934
snprintf(rep, 39, "{%d}", expr->exp_min);
7935
} else if (expr->exp_max < 0) {
7936
snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7937
} else {
7938
snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7939
}
7940
rep[39] = 0;
7941
xmlBufferWriteChar(buf, rep);
7942
break;
7943
}
7944
default:
7945
fprintf(stderr, "Error in tree\n");
7946
}
7947
if (glob)
7948
xmlBufferWriteChar(buf, ")");
7949
}
7950
/**
7951
* xmlExpDump:
7952
* @buf: a buffer to receive the output
7953
* @expr: the compiled expression
7954
*
7955
* Serialize the expression as compiled to the buffer
7956
*/
7957
void
7958
xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7959
if ((buf == NULL) || (expr == NULL))
7960
return;
7961
xmlExpDumpInt(buf, expr, 0);
7962
}
7963
7964
/**
7965
* xmlExpMaxToken:
7966
* @expr: a compiled expression
7967
*
7968
* Indicate the maximum number of input a expression can accept
7969
*
7970
* Returns the maximum length or -1 in case of error
7971
*/
7972
int
7973
xmlExpMaxToken(xmlExpNodePtr expr) {
7974
if (expr == NULL)
7975
return(-1);
7976
return(expr->c_max);
7977
}
7978
7979
/**
7980
* xmlExpCtxtNbNodes:
7981
* @ctxt: an expression context
7982
*
7983
* Debugging facility provides the number of allocated nodes at a that point
7984
*
7985
* Returns the number of nodes in use or -1 in case of error
7986
*/
7987
int
7988
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7989
if (ctxt == NULL)
7990
return(-1);
7991
return(ctxt->nb_nodes);
7992
}
7993
7994
/**
7995
* xmlExpCtxtNbCons:
7996
* @ctxt: an expression context
7997
*
7998
* Debugging facility provides the number of allocated nodes over lifetime
7999
*
8000
* Returns the number of nodes ever allocated or -1 in case of error
8001
*/
8002
int
8003
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8004
if (ctxt == NULL)
8005
return(-1);
8006
return(ctxt->nb_cons);
8007
}
8008
8009
#endif /* LIBXML_EXPR_ENABLED */
8010
8011
#endif /* LIBXML_REGEXP_ENABLED */
8012
8013