(
 *
 * LANGUAGE    : ANS Forth
 * PROJECT     : Forth Environments
 * DESCRIPTION : Matrix Multiplication
 * CATEGORY    : Benchmark
 * AUTHOR      : Mark Smotherman
 * LAST CHANGE : June 12, 2000, Marcel Hendrix
 * )

\ HOST SETUP ==========================================================================
\ Everything in this section is specific to the host Forth and its library layout.

false constant ndp?     \ -- flag ; true if NDP stack version

c" c:\products\PfwVfx"            setmacro VfxDir
c" c:\products\PfwVfx\lib"        setmacro LibDir
c" c:\products\VfSfp\FSL\library" setmacro FslDir
c" c:\products\VfSfp\Hfp"         setmacro NdpDir

\ Load one of the two floating-point packs, selected by ndp? above.
ndp? [if]
  S" %NdpDir%\Ndp387" INCLUDED
[else]
  S" %NdpDir%\Hfp387" INCLUDED
[then]

char . dp-char !        \ select ANS number conversion
char . fp-char !
-short-branches         \ disable short forward branches

S" %FslDir%\VfxUtil" INCLUDED   \ FSL harness for ProForth VFX 3.x
S" %FslDir%\DynMem"  INCLUDED   \ Dynamic memory

0 [IF]
===========================================================================================
 matrix multiply tests -- C language, version 1.0, May 1993

 compile with -DN=<size>

 I usually run a script file
   time.script 500 >500.times
 where the script file contains
   cc -O -DN=$1 mm.c
   a.out -n             (I suggest at least two runs per method to
   a.out -n             alert you to variations. Five or ten runs
   a.out -t             each, giving avg. and std dev. of times is
   a.out -t             best.)
    ...

 Contact Mark Smotherman (mark@cs.clemson.edu) for questions, comments,
 and to report results showing wide variations. E.g., a wide variation
 appeared on an IBM RS/6000 Model 320 with "cc -O -DN=500 mm.c" (xlc compiler):

   500x500 mm - normal algorithm                     utime 230.81 secs
   500x500 mm - normal algorithm                     utime 230.72 secs
   500x500 mm - temporary variable in loop           utime 231.00 secs
   500x500 mm - temporary variable in loop           utime 230.79 secs
   500x500 mm - unrolled inner loop, factor of 8     utime 232.09 secs
   500x500 mm - unrolled inner loop, factor of 8     utime 231.84 secs
   500x500 mm - pointers used to access matrices     utime 230.74 secs
   500x500 mm - pointers used to access matrices     utime 230.45 secs
   500x500 mm - blocking, factor of 32               utime 60.40 secs
   500x500 mm - blocking, factor of 32               utime 60.57 secs
   500x500 mm - interchanged inner loops             utime 27.36 secs
   500x500 mm - interchanged inner loops             utime 27.40 secs
   500x500 mm - 20x20 subarray (from D. Warner)      utime 9.49 secs
   500x500 mm - 20x20 subarray (from D. Warner)      utime 9.50 secs
   500x500 mm - 20x20 subarray (from T. Maeno)       utime 9.10 secs
   500x500 mm - 20x20 subarray (from T. Maeno)       utime 9.05 secs

 The algorithms can also be sensitive to TLB thrashing. On a 600x600 test
 an IBM RS/6000 Model 30 showed variations depending on relative location
 of the matrices. (The model 30 has 64 TLB entries organized as 2-way set
 associative.)

   600x600 mm - 20x20 subarray (from T. Maeno)       utime 19.12 secs
   600x600 mm - 20x20 subarray (from T. Maeno)       utime 19.23 secs
   600x600 mm - 20x20 subarray (from D. Warner)      utime 18.87 secs
   600x600 mm - 20x20 subarray (from D. Warner)      utime 18.64 secs
   600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.70 secs
   600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.76 secs

 Changing the declaration to include 10000 dummy entries between the
 b and c matrices (suggested by T. Maeno), i.e.,

   double a[N][N],b[N][N],dummy[10000],c[N][N],d[N][N],bt[N][N];

   600x600 mm - 20x20 subarray (from T. Maeno)       utime 16.41 secs
   600x600 mm - 20x20 subarray (from T. Maeno)       utime 16.40 secs
   600x600 mm - 20x20 subarray (from D. Warner)      utime 16.68 secs
   600x600 mm - 20x20 subarray (from D. Warner)      utime 16.67 secs
   600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.97 secs
   600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.98 secs

 I hope to add other algorithms (e.g., Strassen-Winograd) in the near future.
=======================================================================================
[THEN]

\ TOOLS ===============================================================================
\ Not portable.

: HTAB  out @ - spaces ;        \ n -- ; step to position n

\ Print n in decimal, leaving BASE as it was found.
: DEC. ( n -- )  BASE @ >R  DECIMAL .  R> BASE ! ;

extern: DWORD PASCAL GetTickCount( void )

: COUNTER       \ -- ms
  GetTickCount ;

0 VALUE _time_          \ COUNTER reading taken by the last TIMER-RESET

\ Remember the current COUNTER value as the start of the timed interval.
: TIMER-RESET ( -- )  COUNTER TO _time_ ;

\ Pictured-number helper: emit a digit, or a blank once the number is exhausted.
: #? ( d -- d )  2DUP OR 0= IF  BL HOLD  ELSE  #  THEN ;

\ Print the time since TIMER-RESET as seconds with three decimals.
: .ELAPSED ( -- )
  COUNTER _time_ -  0
  <#  BL HOLD  # # #  [char] . HOLD  # #? #?  #>  TYPE
  ." seconds elapsed." ;

\ Defining word: create a named cell initialised with 0e.
\ The cells are accessed with DF@ / DF! / DF+! below.
: DFVARIABLE  CREATE 0e F, ;

\ The helpers below are only compiled when the host does not provide them.

C" DF@+" DEFINED 0= [IF]
\ Fetch the double float at addr and advance addr by one dfloat.
: DF@+  ( addr -- addr' ) ( F: -- r )  DUP DF@  DFLOAT+ ;
[THEN]

C" DF+!" DEFINED 0= [IF]
\ Add r to the double float stored at addr.
: DF+!  ( addr -- ) ( F: r -- )  DUP DF@ F+  DF! ;
[THEN]

C" DF!+" DEFINED 0= [IF]
\ Store r at addr and advance addr by one dfloat.
: DF!+  ( addr -- addr' ) ( F: r -- )  DUP DF!  DFLOAT+ ;
[THEN]

C" DF+!+" DEFINED 0= [IF]
\ Add r to the double float at addr and advance addr by one dfloat.
: DF+!+ ( addr -- addr' ) ( F: r -- )  DUP DF@ F+  DF!+ ;
[THEN]

C" *DSUM" DEFINED 0= [IF]
\ Dot product of two contiguous dfloat vectors of length count.
: *DSUM ( addr1 addr2 count -- addr1' addr2' ) ( F: -- n )
  0e
  0 ?DO
    SWAP DF@+  SWAP DF@+  F* F+
  LOOP ;

\ Dot product where addr1 steps by one dfloat and addr2 steps by stride2 bytes.
: *DSUML ( addr1 addr2 count stride2 -- addr1' addr2' ) ( F: -- r )
  LOCALS| stride2 |
  0e
  0 ?DO
    SWAP DF@+  SWAP DUP DF@  stride2 +  F* F+
  LOOP ;
[THEN]

\ Algorithm selector characters accepted by MM.
CHAR x CONSTANT 'x'
CHAR n CONSTANT 'n'
CHAR v CONSTANT 'v'
CHAR u CONSTANT 'u'
CHAR p CONSTANT 'p'
CHAR t CONSTANT 't'
CHAR i CONSTANT 'i'
CHAR b CONSTANT 'b'
CHAR m CONSTANT 'm'
CHAR r CONSTANT 'r'
CHAR w CONSTANT 'w'

\ =====================================================================================

500 CONSTANT N          \ matrices are N x N

DOUBLE DMATRIX a{{      \ left operand
DOUBLE DMATRIX b{{      \ right operand
DOUBLE DMATRIX c{{      \ result
DOUBLE DMATRIX d{{      \ scratch matrix written by FLUSH-CACHE
DOUBLE DMATRIX bt{{     \ transpose of b{{, filled by TRANSPOSE() and ROBERT()

\ Set coefficients so that result matrix should have row entries
\ equal to (1/2)*n*(n-1)*i in row i
: SET-COEFFICIENTS ( -- )
  N 0 ?DO
    N 0 ?DO
      J S>F FDUP  b{{ J I }} DF!  a{{ J I }} DF!
    LOOP
  LOOP ;

\ Store 0e into every element of d{{ (presumably to push a/b/c out of the
\ data cache before a timed run).
: FLUSH-CACHE ( -- )
  N 0 ?DO
    N 0 ?DO
      0e d{{ J I }} DF!
    LOOP
  LOOP ;

FVARIABLE row_sum       \ expected value of every c{{ entry in the current row
FVARIABLE sum           \ N*(N-1)/2

\ Verify that a{{ and b{{ still hold their row index and that c{{ holds
\ i*N*(N-1)/2 in row i.  Prints the first mismatch found and returns.
: CHECK-RESULT ( -- )
  0e row_sum F!
  N N 1- * 2/ S>F sum F!
  N 0 ?DO
    I S>F sum F@ F* row_sum F!
    N 0 ?DO
      a{{ J I }} DF@  J S>F  F<>
      IF  CR ." error in result entry a{{ " J DEC. I DEC. ." }}: "
          a{{ J I }} DF@ F. ." <> " J S>F F.
          UNLOOP UNLOOP EXIT
      ENDIF
      b{{ J I }} DF@  J S>F  F<>
      IF  CR ." error in result entry b{{ " J DEC. I DEC. ." }}: "
          b{{ J I }} DF@ F. ." <> " J S>F F.
          UNLOOP UNLOOP EXIT
      ENDIF
      c{{ J I }} DF@  row_sum F@  F<>
      IF  CR ." error in result entry c{{ " J DEC. I DEC. ." }}: "
          c{{ J I }} DF@ F. ." <> " row_sum F@ F.
          UNLOOP UNLOOP EXIT
      ENDIF
    LOOP
  LOOP ;

\ Straightforward row-by-column product; the inner dot product is done by
\ *DSUML, stepping through b{{ with a stride of N dfloats.
: NORMAL() ( -- )
  CR N 0 .R 'x' EMIT N 0 .R ." mm - normal algorithm" 54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      a{{ J 0 }}  b{{ 0 I }}  N  N DFLOATS  *DSUML 2DROP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ Row-by-column product that keeps the running sum on the float stack.
\ K holds the row index, because only I and J are reachable in the inner loop.
: TNSQ() ( -- )
  0 LOCALS| K |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - temporary variable in loop" 54 HTAB
  TIMER-RESET
  N 0 ?DO  I TO K
    N 0 ?DO
      a{{ J 0 }} DF@  b{{ 0 I }} DF@  F*
      N 1 ?DO
        a{{ K I }} DF@  b{{ I J }} DF@  F* F+
      LOOP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ Inner loop unrolled by 4.  S records the start of the last unrolled group
\ so that the clean-up loop can finish the remaining N mod 4 terms.
\ NOTE(review): the clean-up loop bounds assume N >= 4.
: UNROLL4() ( -- )
  0 0 LOCALS| K S |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - unrolled inner loop, factor of 4" 54 HTAB
  TIMER-RESET
  N 0 ?DO  I TO K
    N 0 ?DO
      0e  0 TO S
      N 3 - 0 ?DO  I TO S
        a{{ K I     }} DF@  b{{ I     J }} DF@  F* F+
        a{{ K I 1+  }} DF@  b{{ I 1+  J }} DF@  F* F+
        a{{ K I 2+  }} DF@  b{{ I 2+  J }} DF@  F* F+
        a{{ K I 3 + }} DF@  b{{ I 3 + J }} DF@  F* F+
      4 +LOOP
      N  S 4 +  ?DO
        a{{ K I }} DF@  b{{ I J }} DF@  F* F+
      LOOP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ Inner loop unrolled by 8; same scheme as UNROLL4().
\ NOTE(review): the clean-up loop bounds assume N >= 8.
: UNROLL8() ( -- )
  0 0 LOCALS| K S |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - unrolled inner loop, factor of 8 " 54 HTAB
  TIMER-RESET
  N 0 ?DO  I TO K
    N 0 ?DO
      0e  0 TO S
      N 7 - 0 ?DO  I TO S
        a{{ K I     }} DF@  b{{ I     J }} DF@  F* F+
        a{{ K I 1+  }} DF@  b{{ I 1+  J }} DF@  F* F+
        a{{ K I 2+  }} DF@  b{{ I 2+  J }} DF@  F* F+
        a{{ K I 3 + }} DF@  b{{ I 3 + J }} DF@  F* F+
        a{{ K I 4 + }} DF@  b{{ I 4 + J }} DF@  F* F+
        a{{ K I 5 + }} DF@  b{{ I 5 + J }} DF@  F* F+
        a{{ K I 6 + }} DF@  b{{ I 6 + J }} DF@  F* F+
        a{{ K I 7 + }} DF@  b{{ I 7 + J }} DF@  F* F+
      8 +LOOP
      N  S 8 +  ?DO
        a{{ K I }} DF@  b{{ I J }} DF@  F* F+
      LOOP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ Inner loop unrolled by 16; same scheme as UNROLL4().
\ NOTE(review): the clean-up loop bounds assume N >= 16.
: UNROLL16() ( -- )
  0 0 LOCALS| K S |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - unrolled inner loop, factor of 16" 54 HTAB
  TIMER-RESET
  N 0 ?DO  I TO K
    N 0 ?DO
      0e  0 TO S
      N 15 - 0 ?DO  I TO S
        a{{ K I      }} DF@  b{{ I      J }} DF@  F* F+
        a{{ K I 1+   }} DF@  b{{ I 1+   J }} DF@  F* F+
        a{{ K I 2+   }} DF@  b{{ I 2+   J }} DF@  F* F+
        a{{ K I 3 +  }} DF@  b{{ I 3 +  J }} DF@  F* F+
        a{{ K I 4 +  }} DF@  b{{ I 4 +  J }} DF@  F* F+
        a{{ K I 5 +  }} DF@  b{{ I 5 +  J }} DF@  F* F+
        a{{ K I 6 +  }} DF@  b{{ I 6 +  J }} DF@  F* F+
        a{{ K I 7 +  }} DF@  b{{ I 7 +  J }} DF@  F* F+
        a{{ K I 8 +  }} DF@  b{{ I 8 +  J }} DF@  F* F+
        a{{ K I 9 +  }} DF@  b{{ I 9 +  J }} DF@  F* F+
        a{{ K I 10 + }} DF@  b{{ I 10 + J }} DF@  F* F+
        a{{ K I 11 + }} DF@  b{{ I 11 + J }} DF@  F* F+
        a{{ K I 12 + }} DF@  b{{ I 12 + J }} DF@  F* F+
        a{{ K I 13 + }} DF@  b{{ I 13 + J }} DF@  F* F+
        a{{ K I 14 + }} DF@  b{{ I 14 + J }} DF@  F* F+
        a{{ K I 15 + }} DF@  b{{ I 15 + J }} DF@  F* F+
      16 +LOOP
      N  S 16 +  ?DO
        a{{ K I }} DF@  b{{ I J }} DF@  F* F+
      LOOP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ Dispatch to the unrolled variant for factor n; any other n prints a message.
: UNROLL ( n -- )
  CASE
     4 OF  UNROLL4()   ENDOF
     8 OF  UNROLL8()   ENDOF
    16 OF  UNROLL16()  ENDOF
    CR ." mm - unrolled inner loop, factor of " DUP DEC. ." not implemented"
  ENDCASE ;

\ Pointer version, inner loop unrolled by 4.  The two addresses on the data
\ stack walk a row of a{{ (one dfloat per step) and a column of b{{
\ (N dfloats per step).
\ NOTE(review): the clean-up loop bounds assume N >= 4.
: PNSQ4() ( -- )
  0 LOCALS| S |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - pointers used, unrolled by 4" 54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      0e  a{{ J 0 }}  b{{ 0 I }}
      0 TO S
      N 3 - 0 ?DO  I TO S
        SWAP DF@+  SWAP DUP DF@  F* F+  N DFLOATS +
        SWAP DF@+  SWAP DUP DF@  F* F+  N DFLOATS +
        SWAP DF@+  SWAP DUP DF@  F* F+  N DFLOATS +
        SWAP DF@+  SWAP DUP DF@  F* F+  N DFLOATS +
      4 +LOOP
      N  S 4 +  ?DO
        SWAP DF@+  SWAP DUP DF@  F* F+  N DFLOATS +
      LOOP
      c{{ J I }} DF!  2DROP
    LOOP
  LOOP
  .ELAPSED ;

\ Pointer version.  n = 4 selects PNSQ4(), n = 0 runs the plain pointer loop,
\ any other n is rejected with a message.
: PNSQ() ( n -- )
  DUP 4 = IF  DROP PNSQ4() EXIT  ENDIF
  CR N 0 .R 'x' EMIT N 0 .R ." mm - pointers used to access matrices"
  ?DUP IF  ." , unroll factor of " DEC. ." not allowed" EXIT  ENDIF
  54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      0e  a{{ J 0 }}  b{{ 0 I }}
      N 0 ?DO
        SWAP DF@+  SWAP DUP DF@  N DFLOATS +  F* F+
      LOOP
      c{{ J I }} DF!  2DROP
    LOOP
  LOOP
  .ELAPSED ;

\ Transpose b{{ into bt{{ first (inside the timed region), then multiply
\ row of a{{ by row of bt{{.
: TRANSPOSE() ( -- )
  0 LOCALS| K |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - transposed B matrix" 54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      b{{ J I }} DF@  bt{{ I J }} DF!
    LOOP
  LOOP
  N 0 ?DO  I TO K
    N 0 ?DO
      a{{ J 0 }} DF@  bt{{ I 0 }} DF@  F*
      N 1 ?DO
        a{{ K I }} DF@  bt{{ J I }} DF@  F* F+
      LOOP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

\ from Monica Lam ASPLOS-IV paper
\ Loop order i, k, j: a{{ i k }} is held on the float stack while row k of
\ b{{ is scaled into row i of c{{.
: REG_LOOPS() ( -- )
  0 LOCALS| K |
  CR N 0 .R 'x' EMIT N 0 .R ." mm - interchanged inner loops" 54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      0e c{{ J I }} DF!
    LOOP
  LOOP
  N 0 ?DO  I TO K
    N 0 ?DO
      a{{ J I }} DF@
      N 0 ?DO
        FDUP  b{{ J I }} DF@  F*  c{{ K I }} DF+!
      LOOP
      FDROP
    LOOP
  LOOP
  .ELAPSED ;

\ from Monica Lam ASPLOS-IV paper
\ Blocked (tiled) product.  step is the tile edge and must satisfy
\ 4 <= step <= N, otherwise a message is printed and nothing is computed.
\ Loop nest: kk (k tile start), jj (j tile start), i (row), k, j.
: TILING() ( step -- )
  CR N 0 .R 'x' EMIT N 0 .R
  DUP 4 N 1+ WITHIN 0=
  IF  CR ." mm - blocking step size of " DEC. ." is unreasonable" EXIT  ENDIF
  ." mm - blocking, factor of " DUP DEC.
  0 0 0 LOCALS| K kk jj step |
  54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      0e c{{ J I }} DF!
    LOOP
  LOOP
  N 0 ?DO  I TO kk
    N 0 ?DO  I TO jj
      \ K must be the row index of THIS loop: two levels further in, only
      \ I (= j) and J (= k) are reachable, and c{{ K I }} has to address row i.
      N 0 ?DO  I TO K
        kk step + N MIN  kk ?DO
          a{{ J I }} DF@
          jj step + N MIN  jj ?DO
            FDUP  b{{ J I }} DF@  F*  c{{ K I }} DF+!
          LOOP
          FDROP
        LOOP
      LOOP
    step +LOOP
  step +LOOP
  .ELAPSED ;

\ ********************************************/
\ * Contributed by Robert Debath 26 Nov 1995 */
\ * rdebath@cix.compulink.co.uk              */
\ ********************************************/
\ Transpose b{{ into bt{{, then take contiguous dot products with *DSUM.
: ROBERT() ( -- )
  CR N 0 .R 'x' EMIT N 0 .R ." mm - Robert's algorithm" 54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      b{{ J I }} DF@  bt{{ I J }} DF!
    LOOP
  LOOP
  N 0 ?DO
    N 0 ?DO
      a{{ J 0 }}  bt{{ I 0 }}  N  *DSUM 2DROP
      c{{ J I }} DF!
    LOOP
  LOOP
  .ELAPSED ;

0 [IF]
===========================================================================
 * Matrix Multiply by Dan Warner, Dept. of Mathematics, Clemson University
 *
 *    mmbu2.f multiplies matrices a and b
 *    a and b are n by n matrices
 *    nb is the blocking parameter.
 *    the tuning guide indicates nb = 50 is reasonable for the
 *    ibm model 530 hence 25 should be reasonable for the 320
 *    since the 320 has 32k rather than 64k of cache.
 *    Inner loops unrolled to depth of 2
 *    The loop functions without clean up code at the end only
 *    if the unrolling occurs to a depth k which divides into n
 *    in this case n must be divisible by 2.
 *    The blocking parameter nb must divide into n if the
 *    multiply is to succeed without clean up code at the end.
 *
 * converted to c by Mark Smotherman
 * note that nb must also be divisible by 2 => cannot use 25, so use 20
===========================================================================
[THEN]

\ Accumulators for the 2x2 sub-block that WARNER() updates per inner pass.
DFVARIABLE s10
DFVARIABLE s00
DFVARIABLE s01
DFVARIABLE s11

\ Blocked product with nb x nb tiles and a 2x2 unrolled kernel.
\ Requires N divisible by nb and by 2, and nb divisible by 2; otherwise a
\ message is printed and nothing is computed.
: WARNER() ( nb -- )
  0 0 0 0 LOCALS| K ii jj kk nb |
  CR N 0 .R 'x' EMIT N 0 .R
  N nb MOD  N 2 MOD  OR
  IF  ." mm - Warner's algorithm, the matrix size " N DEC.
      ." must be divisible both by the block size " nb DEC. ." and 2."
      EXIT
  ENDIF
  nb 2 MOD
  IF  ." mm - block size for Warner method must be evenly divisible by 2"
      EXIT
  ENDIF
  ." mm - D. Warner's algorithm, subarray " nb 0 .R 'x' EMIT nb 0 .R SPACE
  54 HTAB
  TIMER-RESET
  N 0 ?DO  I TO ii
    N 0 ?DO  I TO jj
      \ clear the current nb x nb tile of c{{
      nb ii +  ii ?DO
        nb jj +  jj ?DO
          0e c{{ J I }} DF!
        LOOP
      LOOP
      N 0 ?DO  I TO kk
        nb ii +  ii ?DO  I TO K
          nb jj +  jj ?DO
            c{{ J    I    }} DF@  s00 DF!
            c{{ J    I 1+ }} DF@  s01 DF!
            c{{ J 1+ I    }} DF@  s10 DF!
            c{{ J 1+ I 1+ }} DF@  s11 DF!
            nb kk +  kk ?DO
              a{{ K    I }} DF@  b{{ I J    }} DF@  F*  s00 DF+!
              a{{ K    I }} DF@  b{{ I J 1+ }} DF@  F*  s01 DF+!
              a{{ K 1+ I }} DF@  b{{ I J    }} DF@  F*  s10 DF+!
              a{{ K 1+ I }} DF@  b{{ I J 1+ }} DF@  F*  s11 DF+!
            LOOP
            s00 DF@  c{{ J    I    }} DF!
            s01 DF@  c{{ J    I 1+ }} DF!
            s10 DF@  c{{ J 1+ I    }} DF!
            s11 DF@  c{{ J 1+ I 1+ }} DF!
          2 +LOOP
        2 +LOOP
      nb +LOOP
    nb +LOOP
  nb +LOOP
  .ELAPSED ;

0 [IF]
===========================================================================
 Matrix Multiply tuned for SS-10/30;
 *                      Maeno Toshinori
 *                      Tokyo Institute of Technology
 *
 * Using gcc-2.4.1 (-O2), this program ends in 12 seconds on SS-10/30.
 *
 * in original algorithm - sub-area for cache tiling
 * #define L  20
 * #define L2 20
 * three 20x20 matrices reside in cache; two may be enough
===========================================================================
[THEN]

\ Accumulators for the 2x4 sub-block that MAENO() updates per inner pass:
\ t0..t3 belong to row i, t4..t7 to row i+1.
DFVARIABLE t0
DFVARIABLE t1
DFVARIABLE t2
DFVARIABLE t3
DFVARIABLE t4
DFVARIABLE t5
DFVARIABLE t6
DFVARIABLE t7

\ Blocked product with a 2-row by 4-column kernel.
\ Requires N divisible by the block size and by 4, and the block size
\ divisible by 4; otherwise a message is printed and nothing is computed.
: MAENO() ( nb -- )
  0 0 0 0 0 LOCALS| K it kt i2 kk lparm |
  CR N 0 .R 'x' EMIT N 0 .R
  N lparm MOD  N 4 MOD  OR
  IF  ." mm - Maeno's algorithm, the matrix size " N DEC.
      ." must be divisible both by the block size " lparm DEC. ." and 4."
      EXIT
  ENDIF
  lparm 4 MOD
  IF  ." mm - block size for Maeno's method must be evenly divisible by 4"
      EXIT
  ENDIF
  ." mm - T. Maeno's algorithm, subarray " lparm 0 .R 'x' EMIT lparm 0 .R
  54 HTAB
  TIMER-RESET
  N 0 ?DO
    N 0 ?DO
      0e c{{ J I }} DF!
    LOOP
  LOOP
  N 0 ?DO  I TO i2
    N 0 ?DO  I TO kk
      i2 lparm + TO it
      kk lparm + TO kt
      N 0 ?DO  I TO K
        it i2 ?DO
          0e t0 DF!  0e t1 DF!  0e t2 DF!  0e t3 DF!
          0e t4 DF!  0e t5 DF!  0e t6 DF!  0e t7 DF!
          kt kk ?DO
            \ The address of b{{ k j }} is duplicated: one copy walks four
            \ columns for row i, the other walks the same columns for row i+1.
            a{{ J I }} DF@
            FDUP  b{{ I K }} DUP  DF@+ F*  t0 DF+!
            FDUP                  DF@+ F*  t1 DF+!
            FDUP                  DF@+ F*  t2 DF+!
                                  DF@  F*  t3 DF+!
            a{{ J 1+ I }} DF@
            FDUP                  DF@+ F*  t4 DF+!
            FDUP                  DF@+ F*  t5 DF+!
            FDUP                  DF@+ F*  t6 DF+!
                                  DF@  F*  t7 DF+!
          LOOP
          t0 DF@  c{{ I J }}    DF+!+
          t1 DF@                DF+!+
          t2 DF@                DF+!+
          t3 DF@                DF+!
          t4 DF@  c{{ I 1+ J }} DF+!+
          t5 DF@                DF+!+
          t6 DF@                DF+!+
          t7 DF@                DF+!
        2 +LOOP
      4 +LOOP
    lparm +LOOP
  lparm +LOOP
  .ELAPSED ;

\ Benchmark driver.  char selects the algorithm, n is its optional parameter
\ (unroll factor or block size).  When fewer than two items are on the data
\ stack, 0 is supplied for n.  Allocates the five matrices, initialises them,
\ runs the selected algorithm, checks the result and frees the matrices.
: MM ( char n -- )
  DEPTH 0= ABORT" no algorithm chosen"
  DEPTH 2 < IF  0  ENDIF
  LOCALS| ur |
  & a{{  N N }}malloc malloc-fail?
  & b{{  N N }}malloc malloc-fail? OR
  & bt{{ N N }}malloc malloc-fail? OR
  & c{{  N N }}malloc malloc-fail? OR
  & d{{  N N }}malloc malloc-fail? OR ABORT" MM :: out of core"
  SET-COEFFICIENTS
  FLUSH-CACHE
  CASE
    'n' OF     NORMAL()      ENDOF
    'v' OF     TNSQ()        ENDOF
    'u' OF  ur UNROLL        ENDOF
    'p' OF  ur PNSQ()        ENDOF
    't' OF     TRANSPOSE()   ENDOF
    'i' OF     REG_LOOPS()   ENDOF
    'b' OF  ur TILING()      ENDOF
    'm' OF  ur MAENO()       ENDOF
    'r' OF     ROBERT()      ENDOF   \ ur is not used for this algorithm
    'w' OF  ur WARNER()      ENDOF
    CR ." `" DUP EMIT ." ' is an invalid algorithm"
  ENDCASE
  CHECK-RESULT
  & d{{  }}free
  & c{{  }}free
  & bt{{ }}free
  & b{{  }}free
  & a{{  }}free ;

\ Run every algorithm once.  Relies on the data stack being empty on entry,
\ because MM uses DEPTH to detect a missing parameter.
: ALL-TESTS ( -- )
  'n' mm
  'v' mm
  'u'  4 mm
  'u'  8 mm
  'u' 16 mm
  'p' mm
  'p'  4 mm
  't' mm
  'i' mm
  'b' 20 mm
  'r' mm
  'm' 20 mm
  'w' 20 mm ;

\ Print the usage summary.
\ NOTE(review): the text advertises 'r' 8 mm, but MM ignores the number for 'r';
\ TILING() accepts 4 <= n <= N although the text says 4 < n < N.
: .ABOUT
  CR ." Try: 'n' mm -- normal"
  CR ." 'v' mm -- with a temporary variable in the inner loop"
  CR ." 'u' n mm -- with unrolled (by n) inner loop, n = {4,8,16}"
  CR ." 'p' mm -- using pointers instead of array notation"
  CR ." 'p' 4 mm -- using pointers instead of array notation, unrolled by 4 [new]"
  CR ." 't' mm -- with transposed b matrix"
  CR ." 'i' mm -- with switched inner loops"
  CR ." 'b' n mm -- using blocking by n, 4 < n < " N DEC.
  CR ." 'r' mm -- using Robert's algorithm"
  CR ." 'r' 8 mm -- using Robert's algorithm unrolled by 8"
  CR ." 'm' n mm -- using Maeno's algorithm with blocking factor n"
  CR ." 'w' n mm -- using Warner's algorithm with blocking factor n"
  CR
  CR ." ALL-TESTS -- test all algorithms" ;

.ABOUT

( * End of Source * )

\ Useful test bits
((
: .a{{          \ -- ; display A{{ matrix
  N 0 ?DO  cr
    N 0 ?DO  a{{ J I }} DF@ F.  LOOP
  LOOP ;

: .b{{          \ -- ; display B{{ matrix
  N 0 ?DO  cr
    N 0 ?DO  b{{ J I }} DF@ F.  LOOP
  LOOP ;

: .c{{          \ -- ; display C{{ matrix
  N 0 ?DO  cr
    N 0 ?DO  c{{ J I }} DF@ F.  LOOP
  LOOP ;

: .bt{{         \ -- ; display BT{{ matrix
  N 0 ?DO  cr
    N 0 ?DO  bt{{ J I }} DF@ F.  LOOP
  LOOP ;
))