summaryrefslogtreecommitdiff
path: root/syntax-highlight/lexers.scm
blob: 7ff5b099629ddb22cb43ed58e057692b40b6de08 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
;;; guile-syntax-highlight --- General-purpose syntax highlighter
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;;
;;; Guile-syntax-highlight is free software; you can redistribute it
;;; and/or modify it under the terms of the GNU Lesser General Public
;;; License as published by the Free Software Foundation; either
;;; version 3 of the License, or (at your option) any later version.
;;;
;;; Guile-syntax-highlight is distributed in the hope that it will be
;;; useful, but WITHOUT ANY WARRANTY; without even the implied
;;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
;;; See the GNU Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with guile-syntax-highlight.  If not, see
;;; <http://www.gnu.org/licenses/>.

;;; Commentary:
;;
;; Lexing utilities.
;;
;;; Code:

(define-module (syntax-highlight lexers)
  #:use-module (ice-9 match)
  #:use-module (ice-9 regex)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-9)
  #:use-module (srfi srfi-11)
  #:use-module (srfi srfi-26)
  #:export (make-cursor
            cursor?
            cursor-text
            cursor-position
            cursor-end?
            move-cursor
            move-cursor-by
            move-cursor-to-end
            string->cursor
            cursor->string

            empty-tokens
            token-add
            token-drop
            token-take
            token-peek
            tokens->list

            fail
            lex-fail
            lex-identity
            lex-cons
            lex-bind
            lex-filter
            lex-any*
            lex-any
            lex-all*
            lex-all
            lex-zero-or-more
            lex-consume
            lex-maybe
            lex-regexp
            lex-string
            lex-char-set
            lex-whitespace
            lex-delimited
            lex-tag))

(define (string-prefix?* s1 s2 start-s2)
  (string-prefix? s1 s2 0 (string-length s1) start-s2))


;;;
;;; Cursor
;;;

(define-record-type <cursor>
  (make-cursor text position)
  cursor?
  (text cursor-text)
  (position cursor-position))

(define (cursor-end? cursor)
  "Return #t if the cursor is at the end of the text."
  (>= (cursor-position cursor) (string-length (cursor-text cursor))))

(define (move-cursor cursor position)
  "Move CURSOR to the character at POSITION."
  (make-cursor (cursor-text cursor) position))

(define (move-cursor-by cursor offset)
  "Move CURSOR by OFFSET characters relative to its current
position."
  (move-cursor cursor (+ (cursor-position cursor) offset)))

(define (move-cursor-to-end cursor)
  (move-cursor cursor (string-length (cursor-text cursor))))

(define (string->cursor str)
  (make-cursor str 0))

(define (cursor->string cursor)
  (substring (cursor-text cursor) (cursor-position cursor)))


;;;
;;; Tokens
;;;

(define empty-tokens '())

(define (token-add tokens new-token)
  (cons new-token tokens))

(define (token-peek tokens)
  (car tokens))

(define (token-drop tokens n)
  (drop tokens n))

(define (token-take tokens n)
  (take tokens n))

(define (tokens->list tokens)
  ;; Tokens are accumulated as a stack i.e. in reverse order.
  (reverse tokens))


;;;
;;; Lexers
;;;

(define (fail)
  (values #f #f))

(define (lex-fail tokens cursor)
  "Always fail to parse lexemes without advancing CURSOR or altering
TOKENS."
  (fail))

(define (lex-identity tokens cursor)
  (values tokens cursor))

(define (lex-bind proc lexer)
  "Return a lexer that applies the result of LEXER to PROC, a
procedure that returns a lexer, and then applies that new lexer."
  (lambda (tokens cursor)
    (let-values (((result remainder) (lexer tokens cursor)))
      (if result
          ((proc result) remainder)
          (fail)))))

(define (lex-filter predicate lexer)
  "Return a lexer that succeeds when LEXER succeeds and the head of
the tokens queue satisfies PREDICATE."
  (lambda (tokens cursor)
    (let-values (((result remainder) (lexer tokens cursor)))
      (if (and result (predicate (token-peek result)))
          (values result remainder)
          (fail)))))

(define (lex-any* lexers)
  "Return a lexer that succeeds with the result of the first
successful lexer in LEXERS or fails if all lexers fail."
  (define (either a b)
    (lambda (tokens cursor)
      (let-values (((result remainder) (a tokens cursor)))
        (if result
            (values result remainder)
            (b tokens cursor)))))

  (fold-right either lex-fail lexers))

(define (lex-any . lexers)
  "Return a lexer that succeeds with the result of the first
successful lexer in LEXERS or fails if all lexers fail."
  (lex-any* lexers))

(define (lex-all* lexers)
  "Return a lexer that succeeds with the results of all LEXERS in
order, or fails if any lexer fails."
  (define (both a b)
    (lambda (tokens cursor)
      (let-values (((result-a remainder-a) (a tokens cursor)))
        (if result-a
            (let-values (((result-b remainder-b) (b result-a remainder-a)))
              (if result-b
                  (values result-b remainder-b)
                  (fail)))
            (fail)))))

  (fold-right both lex-identity lexers))

(define (lex-all . lexers)
  "Return a lexer that succeeds with the results of all LEXERS in
order, or fails if any lexer fails."
  (lex-all* lexers))

(define (lex-zero-or-more lexer)
  "Create a lexer that uses LEXER as many times as possible until it
fails and return the results of each success in a list.  The lexer
always succeeds."
  (define (lex tokens cursor)
    (let-values (((result remainder) (lexer tokens cursor)))
      (if result
          (lex result remainder)
          (values tokens cursor))))

  lex)

(define (lex-consume lexer)
  "Return a lexer that always succeeds with a list of as many
consecutive successful applications of LEXER as possible, consuming
the entire input text.  Sections of text that could not be lexed are
returned as plain strings."
  (define (substring* cursor start)
    (substring (cursor-text cursor) start (cursor-position cursor)))

  (define (consume tokens cursor)
    (if (cursor-end? cursor)
        (values tokens cursor)
        (let-values (((result remainder) (lexer tokens cursor)))
          (if result
              (consume result remainder)
              (values (token-add tokens (cursor->string cursor))
                      (move-cursor-to-end cursor))))))

  consume)

(define (lex-maybe lexer)
  "Create a lexer that always succeeds, but tries to use LEXER.  If
LEXER succeeds, its result is returned, otherwise the empty string is
returned without consuming any input."
  (lambda (tokens cursor)
    (let-values (((result remainder) (lexer tokens cursor)))
      (if result
          (values result remainder)
          (values tokens cursor)))))

(define (lex-regexp pattern)
  "Return a lexer that succeeds with the matched substring when the
input matches the string PATTERN."
  (let ((rx (make-regexp (string-append "^" pattern))))
    (lambda (tokens cursor)
      (if (cursor-end? cursor)
          (fail)
          (let ((result (regexp-exec rx (cursor-text cursor)
                                     (cursor-position cursor))))
            (if result
                (let ((str (match:substring result 0)))
                  (values (token-add tokens str)
                          (move-cursor-by cursor (string-length str))))
                (fail)))))))

(define (lex-string str)
  "Return a lexer that succeeds with STR when the input starts with
STR."
  (lambda (tokens cursor)
    (if (string-prefix?* str (cursor-text cursor) (cursor-position cursor))
        (values (token-add tokens str)
                (move-cursor-by cursor (string-length str)))
        (fail))))

(define (lex-char-set char-set)
  "Return a lexer that succeeds with the nonempty input prefix that
matches CHAR-SET, or fails if the first input character does not
belong to CHAR-SET."
  (define (char-set-substring str start)
    (let ((len (string-length str)))
      (let loop ((index start))
        (cond
         ((>= index len)
          (substring str start len))
         ((char-set-contains? char-set (string-ref str index))
          (loop (1+ index)))
         (else
          (substring str start index))))))

  (lambda (tokens cursor)
    (match (char-set-substring (cursor-text cursor) (cursor-position cursor))
      ("" (fail))
      (str
       (values (token-add tokens str)
               (move-cursor-by cursor (string-length str)))))))

(define lex-whitespace
  (lex-char-set char-set:whitespace))

(define* (lex-delimited open #:key (until open) (escape #\\) nested?)
  "Return a lexer that succeeds with the string delimited by the
opening string OPEN and the closing string UNTIL.  Characters within
the delimited expression may be escaped with the character ESCAPE.  If
NESTED?, allow for delimited expressions to be arbitrarily nested
within."
  (define (delimit str start)
    (let ((len (string-length str)))
      (let loop ((index start))
        (cond
         ;; Out of bounds.
         ((>= index len)
          len)
         ;; Escape character.
         ((eqv? escape (string-ref str index))
          (loop (+ index 2)))
         ;; Closing delimiter.
         ((string-prefix?* until str index)
          (+ index (string-length until)))
         ;; Nested delimited string.
         ((and nested? (string-prefix?* open str index))
          (loop (delimit str (+ index (string-length open)))))
         (else
          (loop (1+ index)))))))

  (lambda (tokens cursor)
    (let ((str (cursor-text cursor))
          (pos (cursor-position cursor)))
      (if (string-prefix?* open str pos)
          (let ((end (delimit str (+ pos (string-length open)))))
            (values (token-add tokens (substring str pos end))
                    (move-cursor cursor end)))
          (fail)))))

(define (lex-tag tag lexer)
  "Transform the head element of the tokens queue returned by LEXER
into a two element list consisting of the symbol TAG and the element
itself."
  (lambda (tokens cursor)
    (let-values (((result remainder) (lexer tokens cursor)))
      (if result
          (values (token-add (token-drop result 1)
                             (list tag (token-peek result)))
                  remainder)
          (fail)))))