Skip to content

Commit 4690a33

Browse files
committed
Add enough features to be able to parse clojure.core
- metadata ^ - deref @... - shorthand lambdas #(..) - vars #' - hex numbers - regexes
1 parent af6102c commit 4690a33

File tree

5 files changed

+236
-25
lines changed

5 files changed

+236
-25
lines changed

parseclj-ast.el

+4
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ on available options."
133133
:tag (intern (substring (a-get opening-token :form) 1))
134134
:children children)
135135
stack))
136+
(:metadata (cons (parseclj-ast-node :with-meta
137+
pos
138+
:children children)
139+
stack))
136140
(t (cons
137141
(parseclj-ast-node type pos :children children)
138142
stack)))))

parseclj-lex.el

+108-14
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
:symbol
3737
:keyword
3838
:string
39+
:regex
3940
:character)
4041
"Types of tokens that represent leaf nodes in the AST.")
4142

@@ -44,6 +45,21 @@
4445
:rbrace)
4546
"Types of tokens that mark the end of a non-atomic form.")
4647

48+
(defvar parseclj-lex--prefix-tokens
  '(;; quoting family
    :quote :backquote :unquote :unquote-splice
    ;; dispatch (#) family
    :discard :tag :reader-conditional :reader-conditional-splice :var
    ;; deref
    :deref)
  "Token types that act as a prefix to the single form following them.")
59+
60+
(defvar parseclj-lex--prefix-2-tokens
  '(:metadata)
  "Token types that act as a prefix to the two forms following them.")
62+
4763
;; Token interface
4864

4965
(defun parseclj-lex-token (type form pos &rest attributes)
@@ -81,6 +97,11 @@ A token is an association list with :token-type as its first key."
8197
(and (consp token)
8298
(cdr (assq :token-type token))))
8399

100+
(defun parseclj-lex-token-form (token)
  "Return the value stored under `:form' in TOKEN.
TOKEN is searched as an association list.  The result is nil when
TOKEN is not a cons cell or when it has no `:form' entry."
  (when (consp token)
    (let ((entry (assq :form token)))
      (cdr entry))))
104+
84105
(defun parseclj-lex-leaf-token-p (token)
  "Return non-nil if the type of TOKEN is in `parseclj-lex--leaf-tokens'.
The type is obtained with `parseclj-lex-token-type'."
  (let ((type (parseclj-lex-token-type token)))
    (member type parseclj-lex--leaf-tokens)))
@@ -89,6 +110,9 @@ A token is an association list with :token-type as its first key."
89110
"Return t if the given ast TOKEN is a closing token."
90111
(member (parseclj-lex-token-type token) parseclj-lex--closing-tokens))
91112

113+
(defun parseclj-lex-error-p (token)
  "Return t if the type of TOKEN is `:lex-error', nil otherwise.
The type is obtained with `parseclj-lex-token-type'."
  (let ((type (parseclj-lex-token-type token)))
    (eq :lex-error type)))
92116

93117
;; Elisp values from tokens
94118

@@ -177,18 +201,32 @@ S goes through three transformations:
177201
(<= (char-after (point)) ?9))
178202
(right-char)))
179203

204+
(defun parseclj-lex-skip-hex ()
  "Move point forward past all consecutive hexadecimal digits.
The digits 0-9 and the letters a-f, in either case, are skipped.
Point does not move when the character after point is not a hex
digit or when point is at the end of the buffer.  Return nil."
  ;; `skip-chars-forward' always moves in logical (buffer) order.  The
  ;; visual command `right-char' moves *backward* inside right-to-left
  ;; paragraphs, which is never what a lexer wants.
  (skip-chars-forward "0-9a-fA-F")
  nil)
211+
180212
(defun parseclj-lex-skip-number ()
  "Skip a number at point.
An optional leading sign (+ or -) is skipped first.  If what
follows is a hexadecimal prefix (0x or 0X), the prefix and all hex
digits after it are skipped via `parseclj-lex-skip-hex'.
Otherwise a run of decimal digits, an optional decimal point, and
a second run of decimal digits are skipped via
`parseclj-lex-skip-digits'."
  ;; [\+\-]?(0[xX][0-9a-fA-F]*|\d*\.?\d*)
  ;;
  ;; `forward-char' is used rather than `right-char': the latter is a
  ;; visual movement command and moves backward in right-to-left
  ;; paragraphs.
  (when (memq (char-after (point)) '(?+ ?-))
    (forward-char))
  (if (and (eq ?0 (char-after (point)))
           ;; `char-after' returns nil past the end of the buffer, so
           ;; a lone trailing \"0\" falls through to the decimal branch.
           (memq (char-after (1+ (point))) '(?x ?X)))
      (progn
        (forward-char 2)
        (parseclj-lex-skip-hex))
    (parseclj-lex-skip-digits)
    (when (eq (char-after (point)) ?.)
      (forward-char))
    (parseclj-lex-skip-digits)))
192230

193231
(defun parseclj-lex-number ()
194232
"Consume a number and return a `:number' token representing it."
@@ -270,22 +308,39 @@ are returned as their own lex tokens."
270308
((equal sym "false") (parseclj-lex-token :false "false" pos))
271309
(t (parseclj-lex-token :symbol sym pos))))))
272310

273-
(defun parseclj-lex-string ()
274-
"Return a lex token representing a string.
275-
If EOF is reached without finding a closing double quote, a :lex-error
276-
token is returned."
311+
(defun parseclj-lex-string* ()
  "Helper for string/regex lexing.
Point is expected to be at the opening double quote.  Scan forward
to the next unescaped double quote, treating a backslash as
escaping the character that follows it.

Return the buffer text from the opening quote through the closing
quote, leaving point right after the closing quote.  Return nil if
the end of the buffer is reached before a closing quote is found;
point is then left at the end of the buffer.  Callers are
responsible for turning nil into an error token."
  (let ((pos (point))
        char)
    ;; Step over the opening quote.  `forward-char' (logical motion) is
    ;; used instead of `right-char', which moves backward in
    ;; right-to-left paragraphs.
    (forward-char)
    ;; `char-after' returns nil at the end of the buffer, which ends
    ;; the scan for unterminated input.
    (while (and (setq char (char-after (point)))
                (/= char ?\"))
      (if (= char ?\\)
          ;; Skip the backslash and the escaped character, but never
          ;; move past the end of the buffer: a trailing backslash must
          ;; yield nil rather than signal `end-of-buffer'.
          (goto-char (min (+ (point) 2) (point-max)))
        (forward-char)))
    (when (eq (char-after (point)) ?\")
      (forward-char)
      (buffer-substring-no-properties pos (point)))))
323+
324+
(defun parseclj-lex-string ()
  "Lex the string literal starting at point and return its token.
The literal is consumed with `parseclj-lex-string*'.  On success
the result is a `:string' token whose form is the literal text
and whose position is where point was on entry.  When
`parseclj-lex-string*' returns nil, an error token tagged
`:invalid-string' is returned for that same position."
  (let* ((start (point))
         (text (parseclj-lex-string*)))
    (cond
     (text (parseclj-lex-token :string text start))
     (t (parseclj-lex-error-token start :invalid-string)))))
288333

334+
(defun parseclj-lex-regex ()
  "Lex a regular expression literal and return its token.
The token position is one character before point on entry, and
the string part is consumed with `parseclj-lex-string*'.  On
success the result is a `:regex' token whose form is \"#\"
followed by the consumed string text.  When
`parseclj-lex-string*' returns nil, an error token tagged
`:invalid-regex' is returned for that same position."
  (let* ((start (1- (point)))
         (body (parseclj-lex-string*)))
    (if (null body)
        (parseclj-lex-error-token start :invalid-regex)
      (parseclj-lex-token :regex (concat "#" body) start))))
343+
289344
(defun parseclj-lex-lookahead (n)
  "Return the text of up to N characters following point.
The result is shorter than N when fewer than N characters remain
before `point-max'.  Point is not moved, and the returned string
carries no text properties."
  (let* ((start (point))
         (end (min (+ start n) (point-max))))
    (buffer-substring-no-properties start end)))
@@ -387,6 +442,22 @@ See `parseclj-lex-token'."
387442
(right-char)
388443
(parseclj-lex-token :rbrace "}" pos))
389444

445+
((equal char ?')
446+
(right-char)
447+
(parseclj-lex-token :quote "'" pos))
448+
449+
;; Syntax-quote.  The token form is the backquote character itself,
;; not the plain quote used by the `:quote' token.
((equal char ?`)
 (right-char)
 (parseclj-lex-token :backquote "`" pos))
452+
453+
((equal char ?~)
454+
(right-char)
455+
(if (eq ?@ (char-after (point)))
456+
(progn
457+
(right-char)
458+
(parseclj-lex-token :unquote-splice "~@" pos))
459+
(parseclj-lex-token :unquote "~" pos)))
460+
390461
((parseclj-lex-at-number-p)
391462
(parseclj-lex-number))
392463

@@ -405,6 +476,14 @@ See `parseclj-lex-token'."
405476
((equal char ?\;)
406477
(parseclj-lex-comment))
407478

479+
((equal char ?^)
480+
(right-char)
481+
(parseclj-lex-token :metadata "^" pos))
482+
483+
((equal char ?@)
484+
(right-char)
485+
(parseclj-lex-token :deref "@" pos))
486+
408487
((equal char ?#)
409488
(right-char)
410489
(let ((char (char-after (point))))
@@ -415,6 +494,21 @@ See `parseclj-lex-token'."
415494
((equal char ?_)
416495
(right-char)
417496
(parseclj-lex-token :discard "#_" pos))
497+
((equal char ?\()
498+
(right-char)
499+
(parseclj-lex-token :lambda "#(" pos))
500+
((equal char ?')
501+
(right-char)
502+
(parseclj-lex-token :var "#'" pos))
503+
((equal char ?\")
504+
(parseclj-lex-regex))
505+
((equal char ?\?)
506+
(right-char)
507+
(if (eq ?@ (char-after (point)))
508+
(progn
509+
(right-char)
510+
(parseclj-lex-token :reader-conditional-splice "#?@" pos))
511+
(parseclj-lex-token :reader-conditional "#?" pos)))
418512
((parseclj-lex-symbol-start-p char t)
419513
(right-char)
420514
(parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos))

parseclj-parser.el

+31-5
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,11 @@ can be handled with `condition-case'."
4747
(defun parseclj--find-opening-token (stack closing-token)
4848
"Scan STACK for an opening-token matching CLOSING-TOKEN."
4949
(cl-case (parseclj-lex-token-type closing-token)
50-
(:rparen :lparen)
50+
(:rparen (parseclj-lex-token-type
51+
(seq-find (lambda (token)
52+
(member (parseclj-lex-token-type token)
53+
'(:lparen :lambda)))
54+
stack)))
5155
(:rbracket :lbracket)
5256
(:rbrace (parseclj-lex-token-type
5357
(seq-find (lambda (token)
@@ -79,7 +83,7 @@ available options."
7983
(progn
8084
(when fail-fast
8185
;; any unreduced tokens left: bail early
82-
(when-let ((token (seq-find #'parseclj-lex-token-p collection)))
86+
(when-let* ((token (seq-find #'parseclj-lex-token-p collection)))
8387
(parseclj--error "At position %s, unmatched %S"
8488
(a-get token :pos)
8589
(parseclj-lex-token-type token))))
@@ -192,6 +196,11 @@ functions. Additionally the following options are recognized
192196
;; (message "STACK: %S" stack)
193197
;; (message "TOKEN: %S\n" token)
194198

199+
(when (and fail-fast (parseclj-lex-error-p token))
200+
(parseclj--error "Invalid token at %s: %S"
201+
(a-get token :pos)
202+
(parseclj-lex-token-form token)))
203+
195204
;; Reduce based on the top item on the stack (collections)
196205
(cond
197206
((parseclj-lex-leaf-token-p token)
@@ -204,7 +213,7 @@ functions. Additionally the following options are recognized
204213

205214
;; Reduce based on top two items on the stack (special prefixed elements)
206215
(let* ((top-value (parseclj--take-value stack value-p))
207-
(opening-token (parseclj--take-token (nthcdr (length top-value) stack) value-p '(:discard :tag)))
216+
(opening-token (parseclj--take-token (nthcdr (length top-value) stack) value-p parseclj-lex--prefix-tokens))
208217
new-stack)
209218
(while (and top-value opening-token)
210219
;; (message "Reducing...")
@@ -214,14 +223,31 @@ functions. Additionally the following options are recognized
214223
(setq new-stack (nthcdr (+ (length top-value) (length opening-token)) stack))
215224
(setq stack (funcall reduce-branch new-stack (car opening-token) (append (cdr opening-token) top-value) options))
216225

226+
;; recur
217227
(setq top-value (parseclj--take-value stack value-p))
218-
(setq opening-token (parseclj--take-token (nthcdr (length top-value) stack) value-p '(:discard :tag)))))
228+
(setq opening-token (parseclj--take-token (nthcdr (length top-value) stack) value-p parseclj-lex--prefix-tokens))))
229+
230+
;; Reduce based on top three items on the stack (metadata, namespaced maps)
231+
(let* ((top-value-1 (parseclj--take-value stack value-p))
232+
(top-value-2 (parseclj--take-value (nthcdr (length top-value-1) stack) value-p))
233+
(opening-token (parseclj--take-token (nthcdr (+ (length top-value-1)
234+
(length top-value-2)) stack) value-p parseclj-lex--prefix-2-tokens))
235+
new-stack)
236+
(while (and top-value-1 top-value-2 opening-token)
237+
(setq new-stack (nthcdr (apply #'+ (mapcar #'length (list top-value-1 top-value-2 opening-token))) stack))
238+
(setq stack (funcall reduce-branch new-stack (car opening-token) (append (cdr opening-token) top-value-1 top-value-2) options))
239+
240+
;; recur
241+
(setq top-value-1 (parseclj--take-value stack value-p))
242+
(setq top-value-2 (parseclj--take-value (nthcdr (length top-value-1) stack) value-p))
243+
(setq opening-token (parseclj--take-token (nthcdr (+ (length top-value-1)
244+
(length top-value-2)) stack) value-p parseclj-lex--prefix-2-tokens))))
219245

220246
(setq token (parseclj-lex-next)))
221247

222248
;; reduce root
223249
(when fail-fast
224-
(when-let ((token (seq-find #'parseclj-lex-token-p stack)))
250+
(when-let* ((token (seq-find #'parseclj-lex-token-p stack)))
225251
(parseclj--error "At position %s, unmatched %S"
226252
(a-get token :pos)
227253
(parseclj-lex-token-type token))))

test/parseclj-lex-test.el

+26-6
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,25 @@
5252
(:form . "123e34M")
5353
(:pos . 1)))))
5454

55+
(with-temp-buffer
56+
(insert "0xff00AA")
57+
(goto-char 1)
58+
(should (equal (parseclj-lex-next) '((:token-type . :number)
59+
(:form . "0xff00AA")
60+
(:pos . 1)))))
61+
62+
(with-temp-buffer
63+
(insert "#?(:clj 1 :cljs 2)")
64+
(goto-char 1)
65+
(should (equal (parseclj-lex-next)
66+
'((:token-type . :reader-conditional) (:form . "#?") (:pos . 1)))))
67+
68+
(with-temp-buffer
69+
(insert "#?@(:clj [1] :cljs [2])")
70+
(goto-char 1)
71+
(should (equal (parseclj-lex-next)
72+
'((:token-type . :reader-conditional-splice) (:form . "#?@") (:pos . 1)))))
73+
5574
(with-temp-buffer
5675
(insert "123x")
5776
(goto-char 1)
@@ -203,12 +222,7 @@
203222
(should (equal (parseclj-lex-next) (parseclj-lex-token :number "13" 18)))
204223
(should (equal (parseclj-lex-next) (parseclj-lex-token :whitespace " " 20)))
205224
(should (equal (parseclj-lex-next) (parseclj-lex-token :number "14" 21)))
206-
(should (equal (parseclj-lex-next) (parseclj-lex-token :rparen ")" 23))))
207-
208-
(with-temp-buffer
209-
(insert "~")
210-
(goto-char 1)
211-
(should (equal (parseclj-lex-next) (parseclj-lex-token :lex-error "~" 1)))))
225+
(should (equal (parseclj-lex-next) (parseclj-lex-token :rparen ")" 23)))))
212226

213227
(ert-deftest parseclj-lex-test-at-number-p ()
214228
(dolist (str '("123" ".9" "+1" "0" "-456"))
@@ -295,6 +309,12 @@
295309
(goto-char 1)
296310
(should (equal (parseclj-lex-string) (parseclj-lex-token :string "\"abc\\\"\"" 1)))))
297311

312+
(ert-deftest parseclj-lex-test-regex ()
  "Lexing a regex literal yields one `:regex' token at position 1."
  (let ((expected (parseclj-lex-token :regex "#\"abc\"" 1)))
    (with-temp-buffer
      (insert "#\"abc\"")
      (goto-char 1)
      (should (equal (parseclj-lex-next) expected)))))
317+
298318
(ert-deftest parseclj-lex-test-tag ()
299319
(with-temp-buffer
300320
(insert "#inst")

0 commit comments

Comments
 (0)