diff options
author | David Thompson <dthompson2@worcester.edu> | 2016-02-16 12:44:24 -0500 |
---|---|---|
committer | David Thompson <dthompson2@worcester.edu> | 2016-02-16 12:44:24 -0500 |
commit | 26eaa6e5978b02dedfa7aae7f10be3b0c4d3cb15 (patch) | |
tree | ba501ab16485db702662aefb5d578fc0fd1b2b73 | |
parent | 268fdfe6efbce76f316d96cc6efc9721766f9c22 (diff) |
Add lexers module.
This starts the transition away from the flawed parser combinators to
something really similar, but more capable.
* syntax-highlight/lexers.scm: New file.
* Makefile.am(SOURCES): Add it.
-rw-r--r-- | Makefile.am | 1 | ||||
-rw-r--r-- | syntax-highlight/lexers.scm | 333 |
2 files changed, 334 insertions, 0 deletions
diff --git a/Makefile.am b/Makefile.am index c838685..965ee08 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,6 +40,7 @@ godir=$(libdir)/guile/2.0/ccache SOURCES = \ syntax-highlight/utils.scm \ + syntax-highlight/lexers.scm \ syntax-highlight/parsers.scm \ syntax-highlight/scheme.scm \ syntax-highlight/xml.scm \ diff --git a/syntax-highlight/lexers.scm b/syntax-highlight/lexers.scm new file mode 100644 index 0000000..7ff5b09 --- /dev/null +++ b/syntax-highlight/lexers.scm @@ -0,0 +1,333 @@ +;;; guile-syntax-highlight --- General-purpose syntax highlighter +;;; Copyright © 2015 David Thompson <davet@gnu.org> +;;; +;;; Guile-syntax-highlight is free software; you can redistribute it +;;; and/or modify it under the terms of the GNU Lesser General Public +;;; License as published by the Free Software Foundation; either +;;; version 3 of the License, or (at your option) any later version. +;;; +;;; Guile-syntax-highlight is distributed in the hope that it will be +;;; useful, but WITHOUT ANY WARRANTY; without even the implied +;;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +;;; See the GNU Lesser General Public License for more details. +;;; +;;; You should have received a copy of the GNU Lesser General Public +;;; License along with guile-syntax-highlight. If not, see +;;; <http://www.gnu.org/licenses/>. + +;;; Commentary: +;; +;; Lexing utilities. +;; +;;; Code: + +(define-module (syntax-highlight lexers) + #:use-module (ice-9 match) + #:use-module (ice-9 regex) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-9) + #:use-module (srfi srfi-11) + #:use-module (srfi srfi-26) + #:export (make-cursor + cursor? + cursor-text + cursor-position + cursor-end? + move-cursor + move-cursor-by + move-cursor-to-end + string->cursor + cursor->string + + empty-tokens + token-add + token-drop + token-take + token-peek + tokens->list + + fail + lex-fail + lex-identity + lex-cons + lex-bind + lex-filter + lex-any* + lex-any + lex-all* + lex-all + lex-zero-or-more + lex-consume + lex-maybe + lex-regexp + lex-string + lex-char-set + lex-whitespace + lex-delimited + lex-tag)) + +(define (string-prefix?* s1 s2 start-s2) + (string-prefix? s1 s2 0 (string-length s1) start-s2)) + + +;;; +;;; Cursor +;;; + +(define-record-type <cursor> + (make-cursor text position) + cursor? + (text cursor-text) + (position cursor-position)) + +(define (cursor-end? cursor) + "Return #t if the cursor is at the end of the text." + (>= (cursor-position cursor) (string-length (cursor-text cursor)))) + +(define (move-cursor cursor position) + "Move CURSOR to the character at POSITION." + (make-cursor (cursor-text cursor) position)) + +(define (move-cursor-by cursor offset) + "Move CURSOR by OFFSET characters relative to its current +position." + (move-cursor cursor (+ (cursor-position cursor) offset))) + +(define (move-cursor-to-end cursor) + (move-cursor cursor (string-length (cursor-text cursor)))) + +(define (string->cursor str) + (make-cursor str 0)) + +(define (cursor->string cursor) + (substring (cursor-text cursor) (cursor-position cursor))) + + +;;; +;;; Tokens +;;; + +(define empty-tokens '()) + +(define (token-add tokens new-token) + (cons new-token tokens)) + +(define (token-peek tokens) + (car tokens)) + +(define (token-drop tokens n) + (drop tokens n)) + +(define (token-take tokens n) + (take tokens n)) + +(define (tokens->list tokens) + ;; Tokens are accumulated as a stack i.e. in reverse order. + (reverse tokens)) + + +;;; +;;; Lexers +;;; + +(define (fail) + (values #f #f)) + +(define (lex-fail tokens cursor) + "Always fail to parse lexemes without advancing CURSOR or altering +TOKENS." + (fail)) + +(define (lex-identity tokens cursor) + (values tokens cursor)) + +(define (lex-bind proc lexer) + "Return a lexer that applies the result of LEXER to PROC, a +procedure that returns a lexer, and then applies that new lexer." + (lambda (tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if result + ((proc result) remainder) + (fail))))) + +(define (lex-filter predicate lexer) + "Return a lexer that succeeds when LEXER succeeds and the head of +the tokens queue satisfies PREDICATE." + (lambda (tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if (and result (predicate (token-peek result))) + (values result remainder) + (fail))))) + +(define (lex-any* lexers) + "Return a lexer that succeeds with the result of the first +successful lexer in LEXERS or fails if all lexers fail." + (define (either a b) + (lambda (tokens cursor) + (let-values (((result remainder) (a tokens cursor))) + (if result + (values result remainder) + (b tokens cursor))))) + + (fold-right either lex-fail lexers)) + +(define (lex-any . lexers) + "Return a lexer that succeeds with the result of the first +successful lexer in LEXERS or fails if all lexers fail." + (lex-any* lexers)) + +(define (lex-all* lexers) + "Return a lexer that succeeds with the results of all LEXERS in +order, or fails if any lexer fails." + (define (both a b) + (lambda (tokens cursor) + (let-values (((result-a remainder-a) (a tokens cursor))) + (if result-a + (let-values (((result-b remainder-b) (b result-a remainder-a))) + (if result-b + (values result-b remainder-b) + (fail))) + (fail))))) + + (fold-right both lex-identity lexers)) + +(define (lex-all . lexers) + "Return a lexer that succeeds with the results of all LEXERS in +order, or fails if any lexer fails." + (lex-all* lexers)) + +(define (lex-zero-or-more lexer) + "Create a lexer that uses LEXER as many times as possible until it +fails and return the results of each success in a list. The lexer +always succeeds." + (define (lex tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if result + (lex result remainder) + (values tokens cursor)))) + + lex) + +(define (lex-consume lexer) + "Return a lexer that always succeeds with a list of as many +consecutive successful applications of LEXER as possible, consuming +the entire input text. Sections of text that could not be lexed are +returned as plain strings." + (define (substring* cursor start) + (substring (cursor-text cursor) start (cursor-position cursor))) + + (define (consume tokens cursor) + (if (cursor-end? cursor) + (values tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if result + (consume result remainder) + (values (token-add tokens (cursor->string cursor)) + (move-cursor-to-end cursor)))))) + + consume) + +(define (lex-maybe lexer) + "Create a lexer that always succeeds, but tries to use LEXER. If +LEXER succeeds, its result is returned, otherwise the empty string is +returned without consuming any input." + (lambda (tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if result + (values result remainder) + (values tokens cursor))))) + +(define (lex-regexp pattern) + "Return a lexer that succeeds with the matched substring when the +input matches the string PATTERN." + (let ((rx (make-regexp (string-append "^" pattern)))) + (lambda (tokens cursor) + (if (cursor-end? cursor) + (fail) + (let ((result (regexp-exec rx (cursor-text cursor) + (cursor-position cursor)))) + (if result + (let ((str (match:substring result 0))) + (values (token-add tokens str) + (move-cursor-by cursor (string-length str)))) + (fail))))))) + +(define (lex-string str) + "Return a lexer that succeeds with STR when the input starts with +STR." + (lambda (tokens cursor) + (if (string-prefix?* str (cursor-text cursor) (cursor-position cursor)) + (values (token-add tokens str) + (move-cursor-by cursor (string-length str))) + (fail)))) + +(define (lex-char-set char-set) + "Return a lexer that succeeds with the nonempty input prefix that +matches CHAR-SET, or fails if the first input character does not +belong to CHAR-SET." + (define (char-set-substring str start) + (let ((len (string-length str))) + (let loop ((index start)) + (cond + ((>= index len) + (substring str start len)) + ((char-set-contains? char-set (string-ref str index)) + (loop (1+ index))) + (else + (substring str start index)))))) + + (lambda (tokens cursor) + (match (char-set-substring (cursor-text cursor) (cursor-position cursor)) + ("" (fail)) + (str + (values (token-add tokens str) + (move-cursor-by cursor (string-length str))))))) + +(define lex-whitespace + (lex-char-set char-set:whitespace)) + +(define* (lex-delimited open #:key (until open) (escape #\\) nested?) + "Return a lexer that succeeds with the string delimited by the +opening string OPEN and the closing string UNTIL. Characters within +the delimited expression may be escaped with the character ESCAPE. If +NESTED?, allow for delimited expressions to be arbitrarily nested +within." + (define (delimit str start) + (let ((len (string-length str))) + (let loop ((index start)) + (cond + ;; Out of bounds. + ((>= index len) + len) + ;; Escape character. + ((eqv? escape (string-ref str index)) + (loop (+ index 2))) + ;; Closing delimiter. + ((string-prefix?* until str index) + (+ index (string-length until))) + ;; Nested delimited string. + ((and nested? (string-prefix?* open str index)) + (loop (delimit str (+ index (string-length open))))) + (else + (loop (1+ index))))))) + + (lambda (tokens cursor) + (let ((str (cursor-text cursor)) + (pos (cursor-position cursor))) + (if (string-prefix?* open str pos) + (let ((end (delimit str (+ pos (string-length open))))) + (values (token-add tokens (substring str pos end)) + (move-cursor cursor end))) + (fail))))) + +(define (lex-tag tag lexer) + "Transform the head element of the tokens queue returned by LEXER +into a two element list consisting of the symbol TAG and the element +itself." + (lambda (tokens cursor) + (let-values (((result remainder) (lexer tokens cursor))) + (if result + (values (token-add (token-drop result 1) + (list tag (token-peek result))) + remainder) + (fail))))) |