Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
marvel
GitHub Repository: marvel/qnf
Path: blob/master/elisp/slime/nregex.lisp
989 views
1
;;;
2
;;; This code was written by:
3
;;;
4
;;; Lawrence E. Freil <[email protected]>
5
;;; National Science Center Foundation
6
;;; Augusta, Georgia 30909
7
;;;
8
;;; This program was released into the public domain on 2005-08-31.
9
;;; (See the slime-devel mailing list archive for details.)
10
;;;
11
;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
12
;;; parser.
13
;;;
14
;;; This regular expression parser operates by taking a
15
;;; regular expression and breaking it down into a list
16
;;; consisting of lisp expressions and flags. The list
17
;;; of lisp expressions is then taken in turned into a
18
;;; lambda expression that can be later applied to a
19
;;; string argument for parsing.
20
;;;;
21
;;;; Modifications made 6 March 2001 By Chris Double ([email protected])
22
;;;; to get working with Corman Lisp 1.42, add package statement and export
23
;;;; relevant functions.
24
;;;;
25
26
(in-package :cl-user)
27
28
;; Renamed to slime-nregex avoid name clashes with other versions of
29
;; this file. -- he
30
31
;;;; CND - 6/3/2001
32
(defpackage slime-nregex
33
(:use #:common-lisp)
34
(:export
35
#:regex
36
#:regex-compile
37
))
38
39
;;;; CND - 6/3/2001
40
(in-package :slime-nregex)
41
42
;;;
43
;;; First we create a copy of macros to help debug the beast
44
(eval-when (:compile-toplevel :load-toplevel :execute)
45
(defvar *regex-debug* nil) ; Set to nil for no debugging code
46
)
47
48
(defmacro info (message &rest args)
49
(if *regex-debug*
50
`(format *standard-output* ,message ,@args)))
51
52
;;;
53
;;; Declare the global variables for storing the paren index list.
54
;;;
55
(defvar *regex-groups* (make-array 10))
56
(defvar *regex-groupings* 0)
57
58
;;;
59
;;; Declare a simple interface for testing. You probably wouldn't want
60
;;; to use this interface unless you were just calling this once.
61
;;;
62
(defun regex (expression string)
63
"Usage: (regex <expression> <string)
64
This function will call regex-compile on the expression and then apply
65
the string to the returned lambda list."
66
(let ((findit (cond ((stringp expression)
67
(regex-compile expression))
68
((listp expression)
69
expression)))
70
(result nil))
71
(if (not (funcall (if (functionp findit)
72
findit
73
(eval `(function ,findit))) string))
74
(return-from regex nil))
75
(if (= *regex-groupings* 0)
76
(return-from regex t))
77
(dotimes (i *regex-groupings*)
78
(push (funcall 'subseq
79
string
80
(car (aref *regex-groups* i))
81
(cadr (aref *regex-groups* i)))
82
result))
83
(reverse result)))
84
85
;;;
86
;;; Declare some simple macros to make the code more readable.
87
;;;
88
(defvar *regex-special-chars* "?*+.()[]\\${}")
89
90
(defmacro add-exp (list)
91
"Add an item to the end of expression"
92
`(setf expression (append expression ,list)))
93
94
;;;
95
;;; Define a function that will take a quoted character and return
96
;;; what the real character should be plus how much of the source
97
;;; string was used. If the result is a set of characters, return an
98
;;; array of bits indicating which characters should be set. If the
99
;;; expression is one of the sub-group matches return a
100
;;; list-expression that will provide the match.
101
;;;
102
(defun regex-quoted (char-string &optional (invert nil))
103
"Usage: (regex-quoted <char-string> &optional invert)
104
Returns either the quoted character or a simple bit vector of bits set for
105
the matching values"
106
(let ((first (char char-string 0))
107
(result (char char-string 0))
108
(used-length 1))
109
(cond ((eql first #\n)
110
(setf result #\NewLine))
111
((eql first #\c)
112
(setf result #\Return))
113
((eql first #\t)
114
(setf result #\Tab))
115
((eql first #\d)
116
(setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
117
((eql first #\D)
118
(setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
119
((eql first #\w)
120
(setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
121
((eql first #\W)
122
(setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
123
((eql first #\b)
124
(setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
125
((eql first #\B)
126
(setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
127
((eql first #\s)
128
(setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
129
((eql first #\S)
130
(setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
131
((and (>= (char-code first) (char-code #\0))
132
(<= (char-code first) (char-code #\9)))
133
(if (and (> (length char-string) 2)
134
(and (>= (char-code (char char-string 1)) (char-code #\0))
135
(<= (char-code (char char-string 1)) (char-code #\9))
136
(>= (char-code (char char-string 2)) (char-code #\0))
137
(<= (char-code (char char-string 2)) (char-code #\9))))
138
;;
139
;; It is a single character specified in octal
140
;;
141
(progn
142
(setf result (do ((x 0 (1+ x))
143
(return 0))
144
((= x 2) return)
145
(setf return (+ (* return 8)
146
(- (char-code (char char-string x))
147
(char-code #\0))))))
148
(setf used-length 3))
149
;;
150
;; We have a group number replacement.
151
;;
152
(let ((group (- (char-code first) (char-code #\0))))
153
(setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
154
(cadr (aref *regex-groups* ,group)))))
155
(if (< length (+ index (length nstring)))
156
(return-from compare nil))
157
(if (not (string= string nstring
158
:start1 index
159
:end1 (+ index (length nstring))))
160
(return-from compare nil)
161
(incf index (length nstring)))))))))
162
(t
163
(setf result first)))
164
(if (and (vectorp result) invert)
165
(bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
166
(values result used-length)))
167
168
;;;
169
;;; Now for the main regex compiler routine.
170
;;;
171
(defun regex-compile (source &key (anchored nil))
172
"Usage: (regex-compile <expression> [ :anchored (t/nil) ])
173
This function take a regular expression (supplied as source) and
174
compiles this into a lambda list that a string argument can then
175
be applied to. It is also possible to compile this lambda list
176
for better performance or to save it as a named function for later
177
use"
178
(info "Now entering regex-compile with \"~A\"~%" source)
179
;;
180
;; This routine works in two parts.
181
;; The first pass take the regular expression and produces a list of
182
;; operators and lisp expressions for the entire regular expression.
183
;; The second pass takes this list and produces the lambda expression.
184
(let ((expression '()) ; holder for expressions
185
(group 1) ; Current group index
186
(group-stack nil) ; Stack of current group endings
187
(result nil) ; holder for built expression.
188
(fast-first nil)) ; holder for quick unanchored scan
189
;;
190
;; If the expression was an empty string then it alway
191
;; matches (so lets leave early)
192
;;
193
(if (= (length source) 0)
194
(return-from regex-compile
195
'(lambda (&rest args)
196
(declare (ignore args))
197
t)))
198
;;
199
;; If the first character is a caret then set the anchored
200
;; flags and remove if from the expression string.
201
;;
202
(cond ((eql (char source 0) #\^)
203
(setf source (subseq source 1))
204
(setf anchored t)))
205
;;
206
;; If the first sequence is .* then also set the anchored flags.
207
;; (This is purely for optimization, it will work without this).
208
;;
209
(if (>= (length source) 2)
210
(if (string= source ".*" :start1 0 :end1 2)
211
(setf anchored t)))
212
;;
213
;; Also, If this is not an anchored search and the first character is
214
;; a literal, then do a quick scan to see if it is even in the string.
215
;; If not then we can issue a quick nil,
216
;; otherwise we can start the search at the matching character to skip
217
;; the checks of the non-matching characters anyway.
218
;;
219
;; If I really wanted to speed up this section of code it would be
220
;; easy to recognize the case of a fairly long multi-character literal
221
;; and generate a Boyer-Moore search for the entire literal.
222
;;
223
;; I generate the code to do a loop because on CMU Lisp this is about
224
;; twice as fast a calling position.
225
;;
226
(if (and (not anchored)
227
(not (position (char source 0) *regex-special-chars*))
228
(not (and (> (length source) 1)
229
(position (char source 1) *regex-special-chars*))))
230
(setf fast-first `((if (not (dotimes (i length nil)
231
(if (eql (char string i)
232
,(char source 0))
233
(return (setf start i)))))
234
(return-from final-return nil)))))
235
;;
236
;; Generate the very first expression to save the starting index
237
;; so that group 0 will be the entire string matched always
238
;;
239
(add-exp '((setf (aref *regex-groups* 0)
240
(list index nil))))
241
;;
242
;; Loop over each character in the regular expression building the
243
;; expression list as we go.
244
;;
245
(do ((eindex 0 (1+ eindex)))
246
((= eindex (length source)))
247
(let ((current (char source eindex)))
248
(info "Now processing character ~A index = ~A~%" current eindex)
249
(case current
250
((#\.)
251
;;
252
;; Generate code for a single wild character
253
;;
254
(add-exp '((if (>= index length)
255
(return-from compare nil)
256
(incf index)))))
257
((#\$)
258
;;
259
;; If this is the last character of the expression then
260
;; anchor the end of the expression, otherwise let it slide
261
;; as a standard character (even though it should be quoted).
262
;;
263
(if (= eindex (1- (length source)))
264
(add-exp '((if (not (= index length))
265
(return-from compare nil))))
266
(add-exp '((if (not (and (< index length)
267
(eql (char string index) #\$)))
268
(return-from compare nil)
269
(incf index))))))
270
((#\*)
271
(add-exp '(ASTRISK)))
272
273
((#\+)
274
(add-exp '(PLUS)))
275
276
((#\?)
277
(add-exp '(QUESTION)))
278
279
((#\()
280
;;
281
;; Start a grouping.
282
;;
283
(incf group)
284
(push group group-stack)
285
(add-exp `((setf (aref *regex-groups* ,(1- group))
286
(list index nil))))
287
(add-exp `(,group)))
288
((#\))
289
;;
290
;; End a grouping
291
;;
292
(let ((group (pop group-stack)))
293
(add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
294
index)))
295
(add-exp `(,(- group)))))
296
((#\[)
297
;;
298
;; Start of a range operation.
299
;; Generate a bit-vector that has one bit per possible character
300
;; and then on each character or range, set the possible bits.
301
;;
302
;; If the first character is carat then invert the set.
303
(let* ((invert (eql (char source (1+ eindex)) #\^))
304
(bitstring (make-array 256 :element-type 'bit
305
:initial-element
306
(if invert 1 0)))
307
(set-char (if invert 0 1)))
308
(if invert (incf eindex))
309
(do ((x (1+ eindex) (1+ x)))
310
((eql (char source x) #\]) (setf eindex x))
311
(info "Building range with character ~A~%" (char source x))
312
(cond ((and (eql (char source (1+ x)) #\-)
313
(not (eql (char source (+ x 2)) #\])))
314
(if (>= (char-code (char source x))
315
(char-code (char source (+ 2 x))))
316
(error "Invalid range \"~A-~A\". Ranges must be in acending order"
317
(char source x) (char source (+ 2 x))))
318
(do ((j (char-code (char source x)) (1+ j)))
319
((> j (char-code (char source (+ 2 x))))
320
(incf x 2))
321
(info "Setting bit for char ~A code ~A~%" (code-char j) j)
322
(setf (sbit bitstring j) set-char)))
323
(t
324
(cond ((not (eql (char source x) #\]))
325
(let ((char (char source x)))
326
;;
327
;; If the character is quoted then find out what
328
;; it should have been
329
;;
330
(if (eql (char source x) #\\ )
331
(let ((length))
332
(multiple-value-setq (char length)
333
(regex-quoted (subseq source x) invert))
334
(incf x length)))
335
(info "Setting bit for char ~A code ~A~%" char (char-code char))
336
(if (not (vectorp char))
337
(setf (sbit bitstring (char-code (char source x))) set-char)
338
(bit-ior bitstring char t))))))))
339
(add-exp `((let ((range ,bitstring))
340
(if (>= index length)
341
(return-from compare nil))
342
(if (= 1 (sbit range (char-code (char string index))))
343
(incf index)
344
(return-from compare nil)))))))
345
((#\\ )
346
;;
347
;; Intreprete the next character as a special, range, octal, group or
348
;; just the character itself.
349
;;
350
(let ((length)
351
(value))
352
(multiple-value-setq (value length)
353
(regex-quoted (subseq source (1+ eindex)) nil))
354
(cond ((listp value)
355
(add-exp value))
356
((characterp value)
357
(add-exp `((if (not (and (< index length)
358
(eql (char string index)
359
,value)))
360
(return-from compare nil)
361
(incf index)))))
362
((vectorp value)
363
(add-exp `((let ((range ,value))
364
(if (>= index length)
365
(return-from compare nil))
366
(if (= 1 (sbit range (char-code (char string index))))
367
(incf index)
368
(return-from compare nil)))))))
369
(incf eindex length)))
370
(t
371
;;
372
;; We have a literal character.
373
;; Scan to see how many we have and if it is more than one
374
;; generate a string= verses as single eql.
375
;;
376
(let* ((lit "")
377
(term (dotimes (litindex (- (length source) eindex) nil)
378
(let ((litchar (char source (+ eindex litindex))))
379
(if (position litchar *regex-special-chars*)
380
(return litchar)
381
(progn
382
(info "Now adding ~A index ~A to lit~%" litchar
383
litindex)
384
(setf lit (concatenate 'string lit
385
(string litchar)))))))))
386
(if (= (length lit) 1)
387
(add-exp `((if (not (and (< index length)
388
(eql (char string index) ,current)))
389
(return-from compare nil)
390
(incf index))))
391
;;
392
;; If we have a multi-character literal then we must
393
;; check to see if the next character (if there is one)
394
;; is an astrisk or a plus or a question mark. If so then we must not use this
395
;; character in the big literal.
396
(progn
397
(if (or (eql term #\*)
398
(eql term #\+)
399
(eql term #\?))
400
(setf lit (subseq lit 0 (1- (length lit)))))
401
(add-exp `((if (< length (+ index ,(length lit)))
402
(return-from compare nil))
403
(if (not (string= string ,lit :start1 index
404
:end1 (+ index ,(length lit))))
405
(return-from compare nil)
406
(incf index ,(length lit)))))))
407
(incf eindex (1- (length lit))))))))
408
;;
409
;; Plug end of list to return t. If we made it this far then
410
;; We have matched!
411
(add-exp '((setf (cadr (aref *regex-groups* 0))
412
index)))
413
(add-exp '((return-from final-return t)))
414
;;
415
;;; (print expression)
416
;;
417
;; Now take the expression list and turn it into a lambda expression
418
;; replacing the special flags with lisp code.
419
;; For example: A BEGIN needs to be replace by an expression that
420
;; saves the current index, then evaluates everything till it gets to
421
;; the END then save the new index if it didn't fail.
422
;; On an ASTRISK I need to take the previous expression and wrap
423
;; it in a do that will evaluate the expression till an error
424
;; occurs and then another do that encompases the remainder of the
425
;; regular expression and iterates decrementing the index by one
426
;; of the matched expression sizes and then returns nil. After
427
;; the last expression insert a form that does a return t so that
428
;; if the entire nested sub-expression succeeds then the loop
429
;; is broken manually.
430
;;
431
(setf result (copy-tree nil))
432
;;
433
;; Reversing the current expression makes building up the
434
;; lambda list easier due to the nexting of expressions when
435
;; and astrisk has been encountered.
436
(setf expression (reverse expression))
437
(do ((elt 0 (1+ elt)))
438
((>= elt (length expression)))
439
(let ((piece (nth elt expression)))
440
;;
441
;; Now check for PLUS, if so then ditto the expression and then let the
442
;; ASTRISK below handle the rest.
443
;;
444
(cond ((eql piece 'PLUS)
445
(cond ((listp (nth (1+ elt) expression))
446
(setf result (append (list (nth (1+ elt) expression))
447
result)))
448
;;
449
;; duplicate the entire group
450
;; NOTE: This hasn't been implemented yet!!
451
(t
452
(error "GROUP repeat hasn't been implemented yet~%")))))
453
(cond ((listp piece) ;Just append the list
454
(setf result (append (list piece) result)))
455
((eql piece 'QUESTION) ; Wrap it in a block that won't fail
456
(cond ((listp (nth (1+ elt) expression))
457
(setf result
458
(append `((progn (block compare
459
,(nth (1+ elt)
460
expression))
461
t))
462
result))
463
(incf elt))
464
;;
465
;; This is a QUESTION on an entire group which
466
;; hasn't been implemented yet!!!
467
;;
468
(t
469
(error "Optional groups not implemented yet~%"))))
470
((or (eql piece 'ASTRISK) ; Do the wild thing!
471
(eql piece 'PLUS))
472
(cond ((listp (nth (1+ elt) expression))
473
;;
474
;; This is a single character wild card so
475
;; do the simple form.
476
;;
477
(setf result
478
`((let ((oindex index))
479
(block compare
480
(do ()
481
(nil)
482
,(nth (1+ elt) expression)))
483
(do ((start index (1- start)))
484
((< start oindex) nil)
485
(let ((index start))
486
(block compare
487
,@result))))))
488
(incf elt))
489
(t
490
;;
491
;; This is a subgroup repeated so I must build
492
;; the loop using several values.
493
;;
494
))
495
)
496
(t t)))) ; Just ignore everything else.
497
;;
498
;; Now wrap the result in a lambda list that can then be
499
;; invoked or compiled, however the user wishes.
500
;;
501
(if anchored
502
(setf result
503
`(lambda (string &key (start 0) (end (length string)))
504
(setf *regex-groupings* ,group)
505
(block final-return
506
(block compare
507
(let ((index start)
508
(length end))
509
,@result)))))
510
(setf result
511
`(lambda (string &key (start 0) (end (length string)))
512
(setf *regex-groupings* ,group)
513
(block final-return
514
(let ((length end))
515
,@fast-first
516
(do ((marker start (1+ marker)))
517
((> marker end) nil)
518
(let ((index marker))
519
(if (block compare
520
,@result)
521
(return t)))))))))))
522
523
;; (provide 'nregex)
524
525