(* * LANGUAGE : ANS Forth + extensions * PROJECT : Forth Environments * DESCRIPTION : A little language compiler * CATEGORY : TINYKISS language compiler vsn 0.1 * AUTHOR : Jack W. Crenshaw * LAST CHANGE : Saturday, December 31, 2005, 9:03 AM, Marcel Hendrix, added {} comments and ";"s * LAST CHANGE : Friday, December 30, 2005, 23:16 PM, Marcel Hendrix *) NEEDS -miscutil NEEDS -assemble REVISION -tinykiss "ÄÄÄ TINYKISS compiler Version 1.01 ÄÄÄ" DOC (* Based on the TINY/KISS compiler from Jack Crenshaw's excellent non-technical introduction to compiler construction, found on http://compilers.iecc.com/crenshaw/ . *) ENDDOC VARIABLE debug? debug? OFF -- set TRUE for debug in NEXT VARIABLE compile? compile? OFF -- generate code immediately 0 VALUE #lcount -- for error reporting 0 VALUE Look -- Lookahead character 0 VALUE token -- encoded token CREATE token$ 0 C, #256 CHARS ALLOT -- unencoded token CREATE prog$ 0 C, #256 CHARS ALLOT -- program name -- Input --------------------------------------------------------------------------------------------- : char+! ( c addr -- ) DUP >R COUNT + C! 1 R> C+! ; : char-place ( c addr -- ) 1 OVER C! CHAR+ C! ; : token> ( -- c-addr u ) token$ COUNT ; : =token ( char -- tf ) token = ; : <>token ( char -- tf ) token <> ; : getchar ( -- ) EKEY TO Look Look ^M = 1 AND +TO #lcount ; : init.input ( -- ) CR getchar ; -- Output -------------------------------------------------------------------------------------------- CREATE TAB$ 1 C, ^I C, DEFER .OUT ( c-addr u -- ) : init.output ( -- ) compile? @ IF ['] EVALUATE ELSE ['] TYPE ENDIF [IS] .OUT ; init.output : cemits ( c-addr u -- ) .OUT ; -- output a string : emits ( c-addr u -- ) TAB$ COUNT .OUT .OUT ; -- output a string with tab : emitln ( c-addr u -- ) CR$ COUNT .OUT emits ; -- output a string with tab and crlf : emitdln ( c-addr u -- ) CR$ COUNT .OUT .OUT ; -- output a string without tab + crlf : emit2ln ( c-addr u -- ) emitdln CR$ COUNT .OUT ; -- output a string without tab + crlfs : emit2lns ( c-addr u -- ) emit2ln TAB$ COUNT .OUT ; -- output a string without tab + crlfs -- Errors -------------------------------------------------------------------------------------------- -- report an error : errors ( c-addr u -- ) CR Bell ." Error on line " #lcount 0DEC.R ." , " TYPE ." , seen `" token> TYPE &' EMIT ; : init.error ( -- ) CLEAR #lcount ; -- report line with error : aborts ( c-addr u -- ) errors ABORT ; -- report error and halt : expected ( c-addr u -- ) S" expected" $+ aborts ; -- report what was expected -- Abort, reporting an offending item : $aborts ( $1 $2 -- ) &' CHAR-APPEND S" `" 2SWAP $+ $+ aborts ; : .undefined ( c-addr u -- ) S" Undefined identifier" 2SWAP $aborts ; : .duplicate ( c-addr u -- ) S" Duplicate identifier" 2SWAP $aborts ; -- Symbol tables and labels -------------------------------------------------------------------------- 0 VALUE lbase -- local parameter counter #100 =: maxentry -- Helper to build symbol tables : $, ( c-addr u size -- ) >S S MIN >R R@ C, R@ 0 ?DO C@+ C, LOOP DROP S> R> - CHARS ALLOT ; -- Type declarations : SYMTAB ( size -- ) CREATE DUP 0< IF 0 , ABS DUP , 1+ maxentry * ALLOT ELSE HERE >R 0 , ( #items ) DUP >S , ( itemsize ) BEGIN BL DUP WHILE 2DUP S" \" COMPARE 0= IF 2DROP REFILL DROP ELSE S $, 1 R@ +! ENDIF REPEAT 2DROP -R -S ENDIF DOES> CELL+ @+ 1+ ROT * + ( ix -- addr ) ; -- Definition of keywords and token types #15 CHARS =: /symbol #15 CHARS =: /param /symbol SYMTAB KWlist IF ELSE ENDIF \ WHILE ENDWHILE \ DO ENDDO \ LOOP ENDLOOP \ REPEAT UNTIL \ FOR TO ENDFOR \ BREAK \ READ WRITE \ VAR END \ PROCEDURE \ PROGRAM : KW->token ( kw_index -- ) 2+ C" xilewedeLerufteBRWvepP" + C@ TO token ; /symbol NEGATE SYMTAB SymT =CELL 1- NEGATE SYMTAB SymType /param NEGATE SYMTAB Params =CELL 1- NEGATE SYMTAB ParType 0 SymT 2 CELLS - =: [cnt]SymT 0 Params 2 CELLS - =: [cnt]Params 0 SymType 2 CELLS - =: [cnt]SymType 0 ParType 2 CELLS - =: [cnt]ParType : init.sym ( -- ) [cnt]SymT 0! [cnt]SymType 0! ; : lookup ( c-addr u 'table -- n2|-1 ) 0 0 LOCALS| /symbol n table sz addr | table 2 CELLS - @+ TO n @ TO /symbol n 0<= IF -1 EXIT ENDIF 0 n DO /symbol 1+ I * table + COUNT addr sz COMPARE 0= IF I UNLOOP EXIT ENDIF -1 +LOOP -1 ; -- Locate symbol in table : locate ( c-addr u -- x ) 0 SymT lookup ; -- Returns -1 | the index of the entry. : intable ( c-addr u -- f ) 0 SymT lookup 0>= ; -- Look for symbol in table : clearparams ( -- ) [cnt]Params 0! [cnt]ParType 0! ; -- Clear the parameter table : paramnumber ( c-addr u -- u ) 0 Params lookup ; -- Get parameter's index ( 1 ... ) : param? ( c-addr u -- f ) paramnumber 0> ; -- See if an identifier is a parameter -- Get the type of a symbol or parameter : typeof ( c-addr u -- char ) 2DUP param? IF 2DROP 'f' EXIT ENDIF ( In principle, 'F' could be returned ) 2DUP locate DUP 0< IF DROP .undefined ENDIF NIP NIP SymType C@ ; : checktable ( c-addr u -- ) 2DUP intable IF 2DROP EXIT ENDIF .undefined ; -- Is identifier in the symbol table? : checkdup ( c-addr u -- ) 2DUP intable 0= IF 2DROP EXIT ENDIF .duplicate ; -- Is identifier already in symbol table? : checkident ( -- ) 'x' <>token IF S" Identifier" expected ENDIF ; -- Is current token an identifier? -- Add a new entry to the symbol table : addentry ( c-addr u typ -- ) [cnt]SymT LOCALS| #entries T sz addr | addr sz checkdup #entries @ maxentry = IF S" Symbol table full" aborts ENDIF 1 #entries +! addr sz #entries @ SymT PACK DROP T #entries @ SymType C! ; -- Add a new item to the parameter table : addparam ( c-addr u typ -- ) [cnt]Params LOCALS| #entries T sz addr | addr sz param? IF addr sz .duplicate ENDIF #entries @ maxentry = IF S" Parameter table full" aborts ENDIF 1 #entries +! ( base is 1 ) addr sz #entries @ Params PACK DROP T #entries @ ParType ! ; : checkvar ( c-addr u -- v|V ) 2DUP intable 0= IF .undefined ENDIF 2DUP typeof DUP 'v' <> OVER 'V' <> AND IF DROP S" is not a global variable" $aborts ENDIF NIP NIP ; : checkpar ( index -- f|F ) DUP ParType @ DUP 'f' <> OVER 'F' <> AND IF DROP (.) S" reference to non-local variable #" 2SWAP $+ aborts ENDIF NIP ; -- Dump symbol table : .symbols ( -- ) [cnt]SymT @ 0= IF CR ." No symbols defined." EXIT ENDIF CR ." -- type --.--- name ---" [cnt]SymT @ 0 ?DO CR 5 HTAB I 1+ SymType C@ EMIT #16 HTAB I 1+ SymT .$ LOOP ; -- Code generation ----------------------------------------------------------------------------------- S" codegen.frt" INCLUDED -- Scanning ------------------------------------------------------------------------------------------ WARNING @ WARNING OFF : digit? ( char -- tf ) '0' '9' 1+ WITHIN ; -- recognize a numeric digit WARNING ! : alpha? ( char -- tf ) >UPC 'A' 'Z' 1+ WITHIN ; -- recognize an alpha character : alnum? ( char -- tf ) DUP alpha? SWAP digit? OR ; -- recognize alphanumeric : orop? ( char -- tf ) DUP '|' = SWAP '~' = OR ; -- recognize an OR operand : mulop? ( char -- tf ) DUP '*' = SWAP '/' = OR ; -- test for MulOp : addop? ( char -- tf ) DUP '+' = SWAP '-' = OR ; -- test for AddOp -- Recognize a Relop : relop? ( char -- tf ) DUP '=' = OVER '<' = OR SWAP '>' = OR ; -- match a specific input character : match ( char -- ) DUP Look = IF DROP getchar EXIT ENDIF S" `" ROT CHAR-APPEND &' CHAR-APPEND expected ; -- Recognize white space : white? ( char -- tf ) DUP Tab = OVER BL = OR OVER '{' = OR OVER ^M = OR SWAP ^J = OR ; -- Skip a comment field : skipcomment RECURSIVE ( -- ) BEGIN Look '}' <> WHILE getchar Look '{' = IF skipcomment ENDIF REPEAT getchar ; -- Skip white space : skipwhite ( -- ) BEGIN Look white? WHILE Look '{' = IF skipcomment ELSE getchar ENDIF REPEAT ; -- Get an identifier : getname ( -- ) skipwhite Look alpha? 0= IF S" Identifier" expected ENDIF 'x' TO token token$ C0! BEGIN Look token$ char+! getchar Look alnum? 0= UNTIL ; -- Get a number : getnumber ( -- ) skipwhite Look digit? 0= IF S" Number" expected ENDIF '#' TO token token$ C0! BEGIN Look token$ char+! getchar Look digit? 0= UNTIL ; -- Get an operator : getop ( -- ) skipwhite Look TO token Look token$ char-place getchar ; -- Get next token of any type. Does NOT test if it is a keyword. : (next) ( -- ) skipwhite Look alpha? IF getname EXIT ENDIF Look digit? IF getnumber EXIT ENDIF getop ; : nextt debug? @ IF CR ." next[" ENDIF (next) debug? @ 0= ?EXIT ." ]next token=`" Token EMIT ." ', token$=`" token> TYPE &' EMIT .S ; -- Match a semicolon and advance if it is found. : ?semi ( -- ) ';' =token IF nextt ENDIF ; -- Test if the CURRENT token is a keyword. : iscan ( -- ) 'x' =token IF token> $>UPC 0 KWlist lookup KW->token ENDIF ; -- Match a specific input string and advance if it's there. : matchstring ( c-addr u -- ) token> $>UPC 2OVER COMPARE 0= IF 2DROP nextt EXIT ENDIF &` CHAR-PREPEND &' CHAR-APPEND expected ; -- Match a specific input character and advance if it's there. : matchchar ( char -- ) DUP token$ CHAR+ C@ >UPC = IF DROP nextt EXIT ENDIF S" `" ROT CHAR-APPEND &' CHAR-APPEND expected ; -- Expressions --------------------------------------------------------------------------------------- DEFER boolexpression -- Parse and translate a math factor : factor ( -- ) '(' =token IF nextt boolexpression ')' matchchar EXIT ENDIF 'x' =token IF token> param? IF token> paramnumber _parameter@ ELSE token> _variable@ ENDIF nextt EXIT ENDIF '^' =token IF nextt token> param? IF token> paramnumber _paraddr ELSE token> _varaddr ENDIF nextt EXIT ENDIF '#' =token IF token> _constant@ nextt EXIT ENDIF S" Math factor" expected ; -- Recognize and translate multiply / divide : multiply ( -- ) nextt factor _popmul ; : divide ( -- ) nextt factor _popdiv ; -- Parse and translate a math term : term ( -- ) factor BEGIN token mulop? WHILE _push CASE token '*' OF multiply ENDOF '/' OF divide ENDOF ENDCASE REPEAT ; -- Recognize and translate add / subtract : add ( -- ) nextt term _popadd ; : subtract ( -- ) nextt term _popsub ; -- Parse and translate an expression : expression ( -- ) token addop? IF _clear ELSE term ENDIF BEGIN token addop? WHILE _push CASE token '+' OF add ENDOF '-' OF subtract ENDOF ENDCASE REPEAT ; -- Get next expression : nextx ( -- ) nextt expression ; -- Recognize and translate a relational "equals" / "less or equal" / "not equals" : equals ( -- ) nextx _pcmpe ; : lessorequal ( -- ) nextx _pcmple ; : notequals ( -- ) nextx _pcmpne ; -- Recognize and translate a relational "less than" : less ( -- ) nextt CASE token '=' OF lessorequal ENDOF '>' OF notequals ENDOF expression _pcmpl ENDCASE ; -- Recognize and translate a relational "greater than" : greater ( -- ) nextt '=' =token IF nextx _pcmpge EXIT ENDIF expression _pcmpg ; -- Parse and translate a relation : relation ( -- ) expression token relop? 0= ?EXIT _push CASE token '=' OF equals ENDOF '<' OF less ENDOF '>' OF greater ENDOF ENDCASE ; -- Parse and translate a boolean factor with NOT : notfactor ( -- ) '!' <>token IF relation EXIT ENDIF nextt relation _not ; -- Parse and translate a boolean term : boolterm ( -- ) notfactor BEGIN '&' =token WHILE _push nextt notfactor _popand REPEAT ; -- Recognize and translate a boolean OR / XOR : boolOR ( -- ) nextt boolterm _popor ; : boolXOR ( -- ) nextt boolterm _popxor ; -- Parse and translate a boolean expression :NONAME ( -- ) boolterm BEGIN token orop? WHILE _push CASE token '|' OF boolOR ENDOF '~' OF boolXOR ENDOF ENDCASE REPEAT ; IS boolexpression -- Parse and translate an assignment statement : assignment ( c-addr u -- ) DUP 1+ ALLOCATE ?ALLOCATE DUP LOCAL name PACK DROP nextt '=' matchchar boolexpression name COUNT param? IF name COUNT paramnumber _parameter! ELSE name COUNT _variable! ENDIF name FREE ?ALLOCATE ; -- Process an actual parameter : param ( -- ) expression _push ; -- Process the actual parameter list for a procedure call : paramlist ( -- #bytes ) 0 LOCAL N nextt '(' matchchar ')' <>token IF param 1 +TO N BEGIN ',' =token WHILE nextt param 1 +TO N REPEAT ENDIF ')' matchchar N CELLS ; -- We need a temporary string here because paramlist calls expression, -- which can overwrite token$ : callproc ( c-addr u -- ) DUP 1+ ALLOCATE ?ALLOCATE DUP >S PACK DROP paramlist >R S COUNT _callw R> _cleanstack S> FREE ?ALLOCATE ; -- Decide if a statement is an assignment or procedure call : assignorproc ( -- ) CASE token> typeof 0 OF token> .undefined ENDOF 'v' OF token> assignment ENDOF 'V' OF token> assignment ENDOF 'f' OF token> assignment ENDOF 'F' OF token> assignment ENDOF ( never should appear ) 'p' OF token> callproc ENDOF S" Identifier `" token> $+ S" ' cannot be used here" $+ aborts ENDCASE ; -- Block statements ---------------------------------------------------------------------------------- DEFER pblock : doblock ( -- ) BEGIN iscan 'e' <>token '.' <>token AND WHILE -1 pblock REPEAT ; : beginblock ( -- ) S" BEGIN" matchstring doblock S" END" matchstring ; : doMain ( -- ) S" PROGRAM" matchstring token> prog$ PACK DROP nextt ?semi _prolog beginblock _epilog ; -- Recognize and translate an IF construct : doIF ( label -- ) -1 -1 LOCALS| L2 L1 L | nextt boolexpression assembly? IF newlabel TO L1 L1 TO L2 L1 _branch0 ELSE S" IF " emit2lns ENDIF L pblock 'l' =token IF nextt assembly? IF newlabel TO L2 L2 _branch L1 postlabel ELSE S" ELSE " emit2lns ENDIF L pblock ENDIF assembly? IF L2 postlabel ELSE S" ENDIF " emitdln ENDIF S" ENDIF" matchstring ; -- Recognize and translate a WHILE construct : doWHILE ( -- ) -1 -1 LOCALS| L2 L1 | assembly? IF newlabel TO L1 newlabel TO L2 nextt L1 postlabel ELSE S" BEGIN " emitdln nextt ENDIF boolexpression assembly? IF L2 _branch0 ELSE S" WHILE " emitdln ENDIF L2 pblock S" ENDWHILE" matchstring assembly? IF L1 _branch L2 postlabel ELSE S" REPEAT " emitdln ENDIF ; -- Parse and translate a LOOP statement : doLOOP ( -- ) -1 -1 LOCALS| L2 L1 | assembly? IF newlabel TO L1 newlabel TO L2 ELSE S" BEGIN " emitdln ENDIF nextt assembly? IF L1 postlabel L2 pblock L1 _branch L2 postlabel ELSE L2 pblock S" AGAIN " emitdln ENDIF S" ENDLOOP" matchstring ; -- Parse and translate a REPEAT statement : doREPEAT ( -- ) -1 -1 LOCALS| L2 L1 | assembly? IF newlabel TO L1 newlabel TO L2 nextt L1 postlabel ELSE S" BEGIN " emitdln nextt ENDIF L2 pblock S" UNTIL" matchstring boolexpression assembly? IF L1 _branch0 L2 postlabel ELSE S" UNTIL" emitdln ENDIF ; -- Parse and translate a FOR statement : doFOR ( -- ) -1 -1 -1 LOCALS| name L2 L1 | nextt checkident token> checktable token> DUP 1+ ALLOCATE ?ALLOCATE DUP TO name PACK DROP newlabel TO L1 newlabel TO L2 nextt '=' matchchar expression _decr name COUNT _variable! S" TO" matchstring expression _push L1 postlabel name COUNT _variable@ _incr name COUNT _variable! L2 _pcmp+b0> L2 pblock L1 _branch L2 postlabel S" ENDFOR" matchstring _incSP name FREE ?ALLOCATE ; -- Parse and translate a DO Statement : doDO ( -- ) -1 -1 LOCALS| L2 L1 | newlabel TO L1 newlabel TO L2 nextt expression L1 postlabel _push L2 pblock _pop _decr L1 _branch<>0 _decSP L2 postlabel S" ENDDO" matchstring _incSP ; -- Recognize and translate a BREAK : doBREAK ( label -- ) DUP 0< IF DROP S" No loop to break from" aborts ENDIF assembly? IF _branch ELSE DROP S" BREAK" emitdln ENDIF nextt ; -- Read a single variable : readvar ( -- ) checkident token> checktable token> _readit nextt ; -- Process a read statement : doread ( -- ) nextt '(' matchchar readvar BEGIN ',' =token WHILE nextt readvar REPEAT ')' matchchar ; -- Process a write statement : dowrite ( -- ) nextt '(' matchchar expression _writeit BEGIN ',' =token WHILE nextt expression _writeit REPEAT ')' matchchar ; -- Recognize and translate a statement block :NONAME ( label -- ) LOCAL L BEGIN iscan 'e' <>token 'l' <>token AND 'u' <>token AND WHILE CASE token 'i' OF L doIF ENDOF 'w' OF doWHILE ENDOF 'd' OF doDO ENDOF 'L' OF doLOOP ENDOF 'r' OF doREPEAT ENDOF 'f' OF doFOR ENDOF 'B' OF L doBREAK ENDOF 'R' OF doread ENDOF 'W' OF dowrite ENDOF assignorproc ENDCASE ?semi REPEAT ; IS pblock -- Declarations -------------------------------------------------------------------------------------- -- Process a formal parameter : formalparam ( -- ) '@' =token IF 'F' nextt ELSE 'f' ENDIF token> ROT addparam ; -- Process the formal parameter list for a procedure call : formallist ( -- ) nextt '(' matchchar ')' <>token IF formalparam BEGIN nextt ',' =token WHILE nextt formalparam REPEAT ENDIF ')' matchchar [cnt]Params @ TO lbase 2 [cnt]Params +! ; -- Parse and translate a local data declaration : locdecl ( -- ) token> 'f' addparam nextt ; -- Parse and translate the local declarations : (locdecls) ( n1 -- n2 ) iscan 'v' <>token ?EXIT BEGIN nextt locdecl 1+ ',' <>token UNTIL ?semi ; : locdecls ( -- n ) 0 BEGIN (locdecls) iscan 'v' <>token UNTIL ; -- Parse and translate a procedure declaration. : doProc ( -- ) nextt token> DUP 1+ ALLOCATE ?ALLOCATE PACK >R R@ COUNT 'p' addentry formallist locdecls ( #formals ) R@ COUNT _procprolog R> FREE ?ALLOCATE beginblock _procepilog clearparams ; -- Allocate storage for a variable. -- (**) copies token> into a temporary so that nextt doesn't overwrite it. -- A static string is safe because assignment can not call expression. : alloc ( -- ) '@' =token IF 'V' nextt ELSE 'v' ENDIF token> DUP 1+ ALLOCATE ?ALLOCATE DUP >S PACK DROP 'x' <>token IF S" Variable name" expected ENDIF S COUNT ROT addentry nextt '=' =token IF '=' matchchar '-' =token IF '-' matchchar S" -" token> $+ ( ** ) ELSE token> BL CHAR-APPEND ( ** ) ENDIF ( c-addr u ) nextt ELSE S" 0" ENDIF ( c-addr u ) S COUNT allocatestorage S> FREE ?ALLOCATE ; : decl ( -- ) nextt alloc BEGIN ',' =token WHILE nextt alloc REPEAT ; -- Parse and translate the global declarations : topdecls ( -- ) BEGIN '.' <>token WHILE CASE iscan token 'v' OF decl ENDOF 'p' OF doProc ENDOF 'P' OF doMain ENDOF S" Unrecognized keyword" token> $aborts ENDCASE ?semi REPEAT ; -- Parse and translate a program --------------------------------------------------------------------- -- Initialize everything : init ( -- ) init.output init.code init.error init.sym clearparams _header init.input nextt ; : TINYKISS ( -- ) init topdecls ; : TK ( -- ) TINYKISS ; :ABOUT CR ." Try: tinykiss -- compile text" CR ." or: tk -- compile text" CR ." Also: .SYMBOLS -- dump symbol table" CR ." Try: compiled? OFF | ON -- list / compile" CR ." See the documentation for a BNF." CR CR ." Example input:" CR CR ." var a=22,b=-2;" CR CR ." procedure foo(@x,y)" CR ." begin a=x+y end;" CR CR ." var c=111;" CR CR ." program test; { a test { obviously } }" CR ." begin" CR ." if a&!b|!a&b" CR ." foo(^a, a+b); CR ." write(a*2+b,b+3/a,c)" CR ." endif" CR ." end." CR ; DEPRIVE .ABOUT -tinykiss CR (* End of Source *)