;;; -*- Mode:LISP; Package:USER; Readtable:CL; Base:10 -*- ;;; Simple, half implemented, regular expression parser. ;;; See gnu emacs manual for details. Needs chains to run. (defconstant *regexp-not-newline* #\.) (defconstant *regexp-beginning-of-line* #\^) (defconstant *regexp-end-of-line* #\$) (defconstant *regexp-character-set* #\[) (defconstant *regexp-character-set-end* #\]) (defconstant *regexp-character-set-range-marker* #\-) (defconstant *regexp-character-set-invert-marker* #\^) (defconstant *regexp-zero-or-more* #\*) (defconstant *regexp-one-or-more* #\+) (defconstant *regexp-none-or-one* #\?) (defconstant *regexp-escape-character* #\\) (defconstant *regexp-alternative* #\|) (defconstant *regexp-begin-group* #\() (defconstant *regexp-end-group* #\)) (defconstant *regexp-group-1* #\1) (defconstant *regexp-group-2* #\2) (defconstant *regexp-group-3* #\3) (defconstant *regexp-group-4* #\4) (defconstant *regexp-group-5* #\5) (defconstant *regexp-group-6* #\6) (defconstant *regexp-group-7* #\7) (defconstant *regexp-group-8* #\8) (defconstant *regexp-group-9* #\9) (defconstant *regexp-beginning-of-buffer* #\`) (defconstant *regexp-end-of-buffer* #\') (defconstant *regexp-word-marker* #\b) (defconstant *regexp-not-word* #\B) (defconstant *regexp-begin-word* #\<) (defconstant *regexp-end-word* #\>) (defconstant *regexp-word-constituent* #\w) (defconstant *regexp-not-word-constituent* #\W) (defconstant *regexp-special-syntax* #\s) (defconstant *regexp-special-not-syntax* #\S) (defun tokenize-escapes (chain) (if (empty-chain? chain) the-empty-chain (let ((h (head chain))) (if (eql h *regexp-escape-character*) (let ((n (head (tail chain)))) (case n (#.*regexp-alternative* (cons-chain (list 'alternative) (tokenize-escapes (tail (tail chain))))) (#.*regexp-begin-group* (cons-chain (list 'begin-group) (tokenize-escapes (tail (tail chain))))) (#.*regexp-end-group* (cons-chain (list 'end-group) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-1* (cons-chain (list 'group-1) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-2* (cons-chain (list 'group-2) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-3* (cons-chain (list 'group-3) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-4* (cons-chain (list 'group-4) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-5* (cons-chain (list 'group-5) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-6* (cons-chain (list 'group-6) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-7* (cons-chain (list 'group-7) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-8* (cons-chain (list 'group-8) (tokenize-escapes (tail (tail chain))))) (#.*regexp-group-9* (cons-chain (list 'group-9) (tokenize-escapes (tail (tail chain))))) (#.*regexp-beginning-of-buffer* (cons-chain (list 'beginning-of-buffer) (tokenize-escapes (tail (tail chain))))) (#.*regexp-end-of-buffer* (cons-chain (list 'end-of-buffer) (tokenize-escapes (tail (tail chain))))) (#.*regexp-word-marker* (cons-chain (list 'word) (tokenize-escapes (tail (tail chain))))) (#.*regexp-not-word* (cons-chain (list 'not-word) (tokenize-escapes (tail (tail chain))))) (#.*regexp-begin-word* (cons-chain (list 'begin-word) (tokenize-escapes (tail (tail chain))))) (#.*regexp-end-word* (cons-chain (list 'end-word) (tokenize-escapes (tail (tail chain))))) (#.*regexp-word-constituent* (cons-chain (list 'word-constituent) (tokenize-escapes (tail (tail chain))))) (#.*regexp-not-word-constituent* (cons-chain (list 'not-word-constituent) (tokenize-escapes (tail (tail chain))))) (#.*regexp-special-syntax* (cons-chain (list 'special-syntax (head (tail (tail chain)))) (tokenize-escapes (tail (tail (tail chain)))))) (#.*regexp-special-not-syntax* (cons-chain (list 'special-not-syntax (head (tail (tail chain)))) (tokenize-escapes (tail (tail (tail chain)))))) (otherwise (cons-chain (list 'literal n) (tokenize-escapes (tail (tail chain))))))) (cons-chain h (tokenize-escapes (tail chain))))))) (defun parse-individual-characters (chain) (if (empty-chain? chain) the-empty-chain (let ((h (head chain))) (labels ((continue (tail) (parse-individual-characters tail))) (cond ((characterp h) (case h (#.*regexp-not-newline* (parse-not-newline (tail chain) #'continue)) (#.*regexp-beginning-of-line* (parse-beginning-of-line (tail chain) #'continue)) (#.*regexp-end-of-line* (parse-end-of-line (tail chain) #'continue)) (#.*regexp-character-set* (parse-character-set chain #'continue)) (#.*regexp-zero-or-more* (parse-zero-or-more (tail chain) #'continue)) (#.*regexp-one-or-more* (parse-one-or-more (tail chain) #'continue)) (#.*regexp-none-or-one* (parse-none-or-one (tail chain) #'continue)) (otherwise (cons-chain (list 'literal h) (parse-individual-characters (tail chain) ))))) ((not (consp h)) (ferror nil "Unknown object in chain.")) ((eq (car h) 'literal) (cons-chain h (parse-individual-characters (tail chain)))) ((eq (car h) 'begin-group) (labels ((accumulate (chain stuff) (if (empty-chain? chain) (ferror nil "Unexpected end of chain.") (let ((h (head chain))) (if (and (consp h) (eq (car h) 'end-group)) (cons-chain (list 'group (reverse stuff)) (tail chain)) (accumulate (tail chain) (cons h stuff))))))) (accumulate (parse-individual-characters (tail chain)) '()))) (t (cons-chain h (parse-individual-characters (tail chain))))))))) (defun simple-parse (token) #'(lambda (chain cont) (cons-chain token (funcall cont chain)))) (deff parse-not-newline (simple-parse (list 'not-newline))) (deff parse-beginning-of-line (simple-parse (list 'beginning-of-line))) (deff parse-end-of-line (simple-parse (list 'end-of-line))) (deff parse-zero-or-more (simple-parse (list 'zero-or-more))) (deff parse-one-or-more (simple-parse (list 'one-or-more))) (deff parse-none-or-one (simple-parse (list 'none-or-one))) (defun parse-character-set (chain cont) (collect-character-set chain cont)) (defun collect-character-set (chain cont) (labels ((collect-characters (elements chain) (if (empty-chain? chain) (ferror nil "Unexpected end of chain.") (let ((h (head (tail chain)))) (if (eql h *character-set-end*) (let ((set (expand-character-ranges elements))) (if (eql (first set) *character-invert-set-marker*) (cons-chain (list 'complement-character-set (rest set)) (funcall cont (tail (tail chain)))) (cons-chain (list 'character-set set) (funcall cont (tail (tail chain)))))) (collect-characters (cons h elements) (tail chain))))))) (collect-characters (list (head (tail chain))) (tail chain)))) (defun expand-character-ranges (elements) (labels ((scanner (previous this next) (if (null this) previous (if (eql this *character-range-marker*) (cond ((null previous) (scanner (cons this previous) (first next) (rest next))) ((null next) (ferror nil "Range starts string.")) (t (let ((before (char-int (first next))) (after (char-int (first previous)))) (cond ((> before after) (ferror nil "Range is backward.")) ((= before after) (scanner previous (first (rest next)) (rest (rest next)))) (t (scanner (cons (int-char (1- (char-int (first previous)))) previous) this next)))))) (scanner (cons this previous) (first next) (rest next)))))) (scanner '() (first elements) (rest elements)))) (defun convert-to-prefix-notation (token-list) (labels ((scanner (previous this next) (if (null this) (reverse previous) (let ((tag (car this))) (labels ((gobble-previous-token () (scanner (cons (cons tag (list (car previous))) (cdr previous)) (car next) (cdr next)))) (case tag (zero-or-more (gobble-previous-token)) (one-or-more (gobble-previous-token)) (none-or-one (gobble-previous-token)) (alternative (list (cons tag (cons (reverse previous) (list (convert-to-prefix-notation next)))))) (group (scanner (cons (cons tag (convert-to-prefix-notation (cadr this))) previous) (car next) (cdr next))) (otherwise (scanner (cons this previous) (car next) (cdr next))))))))) (scanner '() (car token-list) (rest token-list)))) (defun compile-token-list (tok) (if (null tok) the-empty-chain (cons-chain (compile-token (car tok)) (compile-token-list (cdr tok))))) (defun compile-token (tok) (case (car tok) (literal (compile-literal (cadr tok))) (not-newline (compile-not-newline)) (zero-or-more (compile-zero-or-more (compile-token (cadr tok)))) (one-or-more (compile-one-or-more (compile-token (cadr tok)))) (none-or-one (compile-none-or-one (compile-token (cadr tok)))) (character-set (compile-character-set (cadr tok))) (complement-character-set (compile-complement-character-set (cadr tok))) (otherwise (ferror nil "Unknown regexp operator ~s" (car tok))))) (defun compile-literal (char) #'(lambda (stream tail-matcher if-not) (cond ((empty-chain? stream) (funcall if-not)) ((eq (head stream) char) (funcall tail-matcher (tail stream))) (t (funcall if-not))))) (defun compile-not-newline () #'(lambda (stream tail-matcher if-not) (cond ((empty-chain? stream) (funcall if-not)) ((eq (head stream) #\newline) (funcall if-not)) (t (funcall tail-matcher (tail stream)))))) (defun compile-zero-or-more (tester) (labels ((matcher (stream tail-matcher if-not) (if (empty-chain? stream) (funcall tail-matcher the-empty-chain) (funcall tester stream #'(lambda (rest) (or (matcher rest tail-matcher if-not) (funcall tail-matcher rest) (funcall tail-matcher stream))) #'(lambda () (funcall tail-matcher stream)))))) #'matcher)) (defun compile-one-or-more (tester) (labels ((matcher (stream tail-matcher if-not) (if (empty-chain? stream) (funcall tail-matcher the-empty-chain) (funcall tester stream #'(lambda (rest) (or (matcher rest tail-matcher if-not) (funcall tail-matcher rest) (funcall if-not))) if-not)))) #'matcher)) (defun compile-none-or-one (tester) (labels ((matcher (stream tail-matcher if-not) (if (empty-chain? stream) (funcall tail-matcher the-empty-chain) (funcall tester stream #'(lambda (rest) (or (funcall tail-matcher rest) (funcall tail-matcher stream))) #'(lambda () (funcall tail-matcher stream)))))) #'matcher)) (defun compile-character-set (set) #'(lambda (chain if-matches if-not) (cond ((empty-chain? chain) (funcall if-not)) ((member (head chain) set :test #'char=) (funcall if-matches (tail chain))) (t (funcall if-not))))) (defun compile-complement-character-set (set) #'(lambda (chain if-matches if-not) (cond ((empty-chain? chain) (funcall if-not)) ((member (head chain) set :test #'char=) (funcall if-not)) (t (funcall if-matches (tail chain)))))) (defun compile-regexp (r) (compile-token-list (convert-to-prefix-notation (chain->list (parse-individual-characters (tokenize-escapes (string->chain r))))))) (defun match-one-token-compiled (r chain if-wins if-loses) (if (empty-chain? r) (funcall if-wins chain) (funcall (head r) chain #'(lambda (rest) (funcall if-wins rest)) if-loses))) (defun match-tokens-compiled (r chain if-win if-lose) (labels ((match-loop (rest-matchers rest-chain) (if (empty-chain? rest-matchers) (funcall if-win) (match-one-token-compiled rest-matchers rest-chain #'(lambda (to-be-matched) (match-loop (tail rest-matchers) to-be-matched)) #'(lambda () (funcall if-lose)))))) (match-loop r chain))) (defun match-compiled (r chain) (match-tokens-compiled r chain #'(lambda () t) #'(lambda () nil))) (defun match-regexp (r string) (match-compiled (compile-regexp r) (string->chain string)))