From 1259d6fb38873b82e10e634b910f9bc4036c8ad9 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Fri, 12 Aug 2022 23:19:54 -0400 Subject: Step 7: Heap Allocation This one took me a looooong time to figure out. --- compiler.scm | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 216 insertions(+), 29 deletions(-) diff --git a/compiler.scm b/compiler.scm index 4305f2c..ab76e20 100644 --- a/compiler.scm +++ b/compiler.scm @@ -15,17 +15,21 @@ (define boolean-shift 7) (define boolean-tag 31) (define empty-list 47) ;; #b00101111 +;; 3 bit tags for heap allocated values. +(define pair-tag 1) +(define vector-tag 2) +(define string-tag 3) (define (immediate-rep x) - (cond - ((integer? x) - (ash x fixnum-shift)) - ((char? x) - (logior (ash (char->integer x) char-shift) char-tag)) - ((boolean? x) - (logior (ash (if x 1 0) boolean-shift) boolean-tag)) - ((null? x) - empty-list))) + (cond + ((integer? x) + (ash x fixnum-shift)) + ((char? x) + (logior (ash (char->integer x) char-shift) char-tag)) + ((boolean? x) + (logior (ash (if x 1 0) boolean-shift) boolean-tag)) + ((null? x) + empty-list))) (define (immediate? x) (or (integer? x) @@ -43,18 +47,15 @@ (define (primcall? x) (and (pair? x) (memq (first x) - '(add1 - sub1 - integer->char - char->integer - zero? - null? - integer? - char? - boolean? + '(add1 sub1 + integer->char char->integer + zero? null? integer? char? boolean? + - * = < <= > >= - eq? char=?)))) + eq? char=? + cons car cdr + make-vector vector vector-length vector-ref vector-set! + make-string string string-length string-ref string-set!)))) (define (if? x) (and (pair? x) @@ -69,6 +70,9 @@ (define (primcall-operand2 x) (third x)) +(define (primcall-operand3 x) + (fourth x)) + (define (lookup name env) (or (assq-ref env name) (error "unbound variable:" name))) @@ -105,12 +109,17 @@ (format #f "$~a" value)) (define rax (register "rax")) +(define rbx (register "rbx")) (define rsp (register "rsp")) +(define rsi (register "rsi")) (define al (register "al")) (define (offset n register) (format #f "~a(~a)" n register)) +(define (register-offset base-register index-register) + (format #f "(~a, ~a)" base-register index-register)) + (define (emit template-string . args) (display " ") (apply format #t template-string args) @@ -119,9 +128,15 @@ (define (emit-mov src dest) (emit "mov ~a, ~a" src dest)) +(define (emit-movb src dest) + (emit "movb ~a, ~a" src dest)) + (define (emit-and mask dest) (emit "and ~a, ~a" mask dest)) +(define (emit-andq mask dest) + (emit "andq ~a, ~a" mask dest)) + (define (emit-or addend dest) (emit "or ~a, ~a" addend dest)) @@ -257,6 +272,167 @@ (emit-comparison (primcall-operand1 x) (primcall-operand2 x) emit-sete si env)) ((char=?) (emit-comparison (primcall-operand1 x) (primcall-operand2 x) emit-sete si env)) + ((cons) + (emit-expr (primcall-operand2 x) si env) ; eval cdr + (emit-mov rax (offset si rsp)) ; save car to stack + (emit-expr (primcall-operand1 x) (- si wordsize) env) ; eval car + (emit-mov rax (offset 0 rsi)) ; move car onto heap + (emit-mov (offset si rsp) rax) ; copy cdr from the stack + (emit-mov rax (offset wordsize rsi)) ; move cdr onto heap + (emit-mov rsi rax) ; heap pointer is the value returned + (emit-or (immediate pair-tag) rax) ; set tag + (emit-add (immediate (* wordsize 2)) rsi)) ; bump heap pointer + ((car) + (emit-expr (primcall-operand1 x) si env) + ;; We have to untag the pair to get the pointer to the 'car'. + ;; The pair tag is 1 so simply subtracting 1 gets us the pointer. + (emit-mov (offset -1 rax) rax)) + ((cdr) + (emit-expr (primcall-operand1 x) si env) + ;; Again, the value is the pointer incremented by 1, so to get to + ;; the cdr we need to jump ahead one word minus 1 byte. + (emit-mov (offset (- wordsize 1) rax) rax)) + ((make-vector) + (emit-expr (primcall-operand1 x) si env) + ;; Wouldn't it be better to save the length untagged so that + ;; vector-ref and vector-set! don't have untag it and + ;; vector-length just needs to retag it? + (emit-mov rax (offset 0 rsi)) ; save length onto heap + (emit-mov rax rbx) ; save length in another register + (emit-mov rsi rax) ; copy heap pointer + (emit-or (immediate vector-tag) rax) ; set tag + ;; Align to next two-word object boundary. I had to add an + ;; additional shift instruction compared to what the paper did to + ;; accommodate the difference in word size (the paper uses 4 byte + ;; words, I'm using 8 byte words) which makes me wonder: For a 64 + ;; bit compiler, should the fixnum tag be 3 bits instead of 2? + (emit-sal (immediate 1) rbx) + (emit-add (immediate (* 2 wordsize)) rbx) + (emit-and (immediate (- (* 2 wordsize))) rbx) + (emit-add rbx rsi)) ; bump heap pointer by length of vector + ((vector) + (let ((items (cdr x))) + ;; Eval all vector items and save them to stack locations. + ;; It's important that we eval all items first, and not copy to + ;; the heap as we go, because any sub-expression that also does + ;; heap allocation will corrupt the heap space we think we have + ;; all to ourselves here. + (let loop ((items items) + (si si)) + (unless (null? items) + (emit-expr (car items) si env) ; eval item + (emit-mov rax (offset si rsp)) ; save to stack + (loop (cdr items) (- si wordsize)))) + ;; Save length onto heap (tagged as immediate int) + (emit-mov (immediate (ash (length items) 2)) rax) + (emit-mov rax (offset 0 rsi)) + ;; Copy items from the stack to the vector. + (let loop ((items items) + (si si) + (heap-offset wordsize)) + (unless (null? items) + (emit-mov (offset si rsp) rax) ; copy from stack + (emit-mov rax (offset heap-offset rsi)) ; save to heap + (loop (cdr items) (- si wordsize) (+ heap-offset wordsize)))) + (emit-mov rsi rax) ; copy heap pointer + (emit-or (immediate vector-tag) rax) ; set tag + ;; Align heap pointer to next available 2 word boundary. + (emit-add (immediate (logand (* (+ (length items) 2) wordsize) + (- (* 2 wordsize)))) + rsi))) + ((vector-length) + (emit-expr (primcall-operand1 x) si env) ; get vector pointer + (emit-sub (immediate vector-tag) rax) ; untag vector + (emit-mov (offset 0 rax) rax)) ; the first word contains the length + ((vector-ref) + (emit-expr (primcall-operand2 x) si env) ; get index arg + (emit-shr (immediate fixnum-shift) rax) ; untag it + (emit-add (immediate 1) rax) ; first word is the length so skip over it + (emit-imul (immediate wordsize) rax) ; scale by word size + (emit-mov rax rbx) ; save index to another register + (emit-expr (primcall-operand1 x) si env) ; get vector pointer + (emit-sub (immediate vector-tag) rax) ; untag vector + (emit-mov (register-offset rbx rax) rax)) ; get element + ((vector-set!) + (emit-expr (primcall-operand1 x) si env) ; get vector pointer + (emit-sub (immediate vector-tag) rax) ; untag vector + (emit-mov rax rbx) ; save vector + (emit-expr (primcall-operand2 x) si env) ; get index + (emit-shr (immediate fixnum-shift) rax) ; untag it + (emit-add (immediate 1) rax) ; first word is the length so skip over it + (emit-imul (immediate wordsize) rax) ; scale by word size + (emit-add rax rbx) ; advance pointer to element being set + (emit-expr (primcall-operand3 x) si env) ; get value + (emit-mov rax (offset 0 rbx))) + ((make-string) + (emit-expr (primcall-operand1 x) si env) + (emit-mov rax (offset 0 rsi)) ; save length onto heap + (emit-mov rax rbx) ; save length in another register + (emit-mov rsi rax) ; write to heap + (emit-or (immediate string-tag) rax) ; set tag + (emit-shr (immediate fixnum-shift) rbx) ; untag length + ;; Align to next two-word object boundary, keeping in mind that + ;; we need one additional word to store the length. Since we're + ;; only storing ASCII characters in this simple compiler, we only + ;; need 1 byte per character. + (emit-add (immediate (- (* 3 wordsize) 1)) rbx) + (emit-and (immediate (- (* 2 wordsize))) rbx) + (emit-add rbx rsi)) ; bump heap pointer by length of string + ((string) + (let ((chars (cdr x))) + ;; Save length onto heap (tagged as immediate int) + (emit-mov (immediate (ash (length chars) 2)) rax) + (emit-mov rax (offset 0 rsi)) + ;; Add chars to string, one byte per item since we're only + ;; covering the ASCII character set. + (let loop ((chars chars) + (heap-offset 1)) + (unless (null? chars) + (emit-expr (first chars) si env) ; eval arg (should be char) + (emit-shr (immediate char-shift) rax) ; untag char + (emit-andq (immediate -255) (offset heap-offset rsi)) ; clear LSB + (emit-or rax (offset heap-offset rsi)) ; set char + (loop (cdr chars) (+ heap-offset 1)))) + (emit-mov rsi rax) ; copy heap pointer + (emit-or (immediate string-tag) rax) ; set tag + ;; Align heap pointer to next available 2 word boundary. + (emit-add (immediate (logand (+ (length chars) (- (* 3 wordsize) 1)) + (- (* 2 wordsize)))) + rsi))) + ((string-length) + (emit-expr (primcall-operand1 x) si env) ; get string pointer + (emit-sub (immediate string-tag) rax) ; untag string + (emit-mov (offset 0 rax) rax)) ; the first word contains the length + ((string-ref) + (emit-expr (primcall-operand2 x) si env) ; get index arg + (emit-shr (immediate fixnum-shift) rax) ; untag it + ;; The first word of a string contains the length, however we + ;; don't want to advance the pointer by a word because each + ;; character is only a single byte. Instead, we advance the + ;; pointer by a single byte so that the character we want to + ;; access is in the least significant bit (LSB) section of the + ;; word. From there, its a simple matter of masking out + ;; everything but the LSB to isolate the character. + (emit-add (immediate 1) rax) + (emit-mov rax rbx) ; save index to another register + (emit-expr (primcall-operand1 x) si env) ; get string pointer + (emit-sub (immediate string-tag) rax) ; untag string + (emit-mov (register-offset rbx rax) rax) ; get char into LSB position + (emit-and (immediate 255) rax) ; clear out everything but the LSB + (emit-sal (immediate char-shift) rax) ; tag char + (emit-or (immediate char-tag) rax)) + ((string-set!) + (emit-expr (primcall-operand1 x) si env) ; get string pointer + (emit-sub (immediate string-tag) rax) ; untag string + (emit-mov rax rbx) ; save string pointer + (emit-expr (primcall-operand2 x) si env) ; get index arg + (emit-shr (immediate fixnum-shift) rax) ; untag it + (emit-add (immediate 1) rax) + (emit-add rax rbx) ; get char into LSB position + (emit-expr (primcall-operand3 x) si env) ; get value + (emit-shr (immediate char-shift) rax) ; untag char + (emit-andq (immediate -255) (offset 0 rbx)) ; clear LSB + (emit-or rax (offset 0 rbx))) ; copy char (else (error "unknown primcall op" (primcall-op x))))) @@ -308,7 +484,7 @@ (emit "ret"))) (define (compile-and-run x) - (with-output-to-file "scheme_entry.S" + (with-output-to-file "scheme_entry.s" (lambda () (display ".text .p2align 4 @@ -317,17 +493,21 @@ scheme_entry: ") (compile-program x))) - (and (zero? (system* "gcc" "-c" "scheme_entry.S")) - (zero? (system* "gcc" "-c" "test.c")) - (zero? (system* "gcc" "-o" "test" "scheme_entry.o" "test.o")) - (let* ((pipe (open-pipe* OPEN_READ "./test")) - (output (read-line pipe))) - (close pipe) - output))) + (unless (zero? (system* "gcc" "-c" "scheme_entry.s")) + (error "failed to compile scheme_entry.s")) + (unless (zero? (system* "gcc" "-c" "test.c")) + (error "failed to compile test.c")) + (unless (zero? (system* "gcc" "-o" "test" "scheme_entry.o" "test.o")) + (error "failed to link program")) + (let* ((pipe (open-pipe* OPEN_READ "./test")) + (output (read-line pipe))) + (close pipe) + output)) (define (test-case x expected-output) (let ((result (compile-and-run x))) - (if (string=? result expected-output) + (if (and (not (eof-object? result)) + (string=? result expected-output)) #t (begin (display "expected: ") @@ -383,4 +563,11 @@ scheme_entry: (test-case '(let ((x 1)) x) "1") (test-case '(let ((x 1) (y 2)) (+ x y)) "3") (test-case '(if #t 1 2) "1") - (test-case '(if #f 1 2) "2")) + (test-case '(if #f 1 2) "2") + (test-case '(car (cons 10 20)) "10") + (test-case '(cdr (cons 10 20)) "20") + (test-case '(car (cdr (cons 1 (cons 2 '())))) "2") + (test-case '(vector-length (make-vector 3)) "3") + (test-case '(vector-ref (vector 1 2 3) 1) "2") + (test-case '(string-length (make-string 5)) "5") + (test-case '(string-ref (string #\a #\b #\c) 1) "b")) -- cgit v1.2.3