; mul1616 -- table-driven 16x16 multiply.
;
; Reads two bytes at mT1 and two bytes at mT2 and writes four bytes to mRes.
; NOTE(review): presumably mT1/mT2 are the little-endian 16-bit operands and
; mRes the 32-bit product; their declarations are not in this chunk -- confirm.
;
; Each byte of mT1 is written into the LOW byte of a set of zero-page
; pointers: zp_f* receive the byte itself, zp_g* receive its one's
; complement. The pointer high bytes are not written here and must already
; be set up by the caller (presumably pointing at the f/g lo/hi tables --
; confirm). A byte of mT2 in Y then indexes the tables.
;
; Every partial sum is accumulated in A with ADC. X counts the carries
; that fall out of the column, and a final SBC from a small table indexed
; by X removes the accumulated bias (see ofste_3f / ofste_80 below).
;
; Clobbers A, X, Y and the processor flags.
; Trailing numbers are the original author's per-instruction cycle counts
; (x.5 = averaged over the two possible timings).

mul1616
    ; --- point the "0" pointers at the entries for mT1 byte 0 ---
    lda mT1+0           ; 3
    sta zp_fl0          ; 3
    sta zp_fh0          ; 3
    eor #$ff            ; 2   one's complement selects the g-table entry
    sta zp_gl0          ; 3
    sta zp_gh0          ; 3

    ; --- point the "1" pointers at the entries for mT1 byte 1 ---
    lda mT1+1           ; 3
    sta zp_fl1          ; 3
    sta zp_fh1          ; 3
    eor #$ff            ; 2
    sta zp_gl1          ; 3
    sta zp_gh1          ; 3

    ; --- result byte 0: low byte of (mT1 byte 0) x (mT2 byte 0) ---
    clc                 ; 2
    ldy mT2+0           ; 3
    lda (zp_fl0),y      ; 5.5
    adc (zp_gl0),y      ; 5.5
    sta mRes+0          ; 3

    ; --- result byte 1: six table bytes summed, X counts the carries ---
    ldx #0              ; 2
    lda (zp_fh0),y      ; 5.5
    adc (zp_gh0),y      ; 5.5 carry-in is the carry out of byte 0
    bcc *+3             ; 2.5 skip the one-byte INX when no carry
    inx                 ; 1
    adc (zp_fl1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_gl1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    ldy mT2+1           ; 3   switch index to mT2 byte 1
    adc (zp_fl0),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_gl0),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    sbc ofste_3f,x      ; 4   subtract the correction for X carries
    sta mRes+1          ; 3

    ; --- result byte 2: start from the carry count of byte 1 ---
    txa                 ; 2
    ldx #$bf            ; 2   carry counter now starts at $bf
    adc (zp_fh0),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_gh0),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_fl1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_gl1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    ldy mT2+0           ; 3   back to mT2 byte 0
    adc (zp_fh1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    adc (zp_gh1),y      ; 5.5
    bcc *+3             ; 2.5
    inx                 ; 1
    sbc ofste_80-$bf,x  ; 4   table base is biased by the $bf starting value of X
    sta mRes+2          ; 3

    ; --- result byte 3: carry count of byte 2 plus the last high bytes ---
    txa                 ; 2
    ldy mT2+1           ; 3
    adc (zp_fh1),y      ; 5.5
    clc                 ; 2   carry out of the previous add is dropped here
    adc (zp_gh1),y      ; 5.5
    sta mRes+3          ; 3
    rts                 ; total=204.5

; Correction values for result byte 1, indexed by carry count (X = 0..5).
ofste_3f
    .byt $3f,$40,$41,$42,$43,$44
    ; Zero padding; the count is the low byte of ($bf - current address).
    ; NOTE(review): presumably this keeps "ofste_80-$bf,x" from crossing a
    ; page boundary -- confirm against the final link address.
    .dsb <$bf-*,0
; Correction values for result byte 2, reached with X = $bf..$c5.
ofste_80
    .byt $80,$81,$82,$83,$84,$85,$86

def f(x):
    """Quarter-square: floor(x*x / 4)."""
    return x * x // 4


def g(x):
    """0x4000 minus the quarter-square of (x - 255), wrapped to 16 bits."""
    return (0x4000 - f(x - 255)) & 0xffff


# Emit four 512-entry tables: the low and high bytes of f and of g.
# dumpArrayToA65, fo, lo and hi are defined outside this fragment.
for _label, _byte_of, _fn in (
    ("flo", lo, f),
    ("fhi", hi, f),
    ("glo", lo, g),
    ("ghi", hi, g),
):
    dumpArrayToA65(fo, _label, [_byte_of(_fn(i)) for i in range(512)])

Likewise, I could easily shave 4 cycles off mine at the expense of ~36 more bytes of zero page, but I don't consider that elegant or worthwhile.

r pc=c000

z

LDA $FB - A:00 X:0A Y:00 SP:eb ..-..IZ. 72210316
CLC     - A:00 X:0A Y:00 SP:eb ..-..IZC 72210476
RTS     - A:00 X:0A Y:00 SP:eb ..-..IZ. 72210520

; Fast 16x16 -> 32-bit unsigned multiply for the 6502.
; By Repose, 2017. Billed by the author as the world's fastest: it can be
; beaten, but only with more code and/or data, and at the cost of being
; less elegant and harder to follow.
;
; Method: quarter-square tables.
;   sqr(x)    = x^2/4
;   negsqr(x) = (255-x)^2/4
; Each 8x8 partial product is one table entry minus another.

; --- tables of squares (the routine that builds them is not shown) ---
sqrlo=$c000         ; 511 bytes
sqrhi=$c200         ; 511 bytes
negsqrlo=$c400      ; 511 bytes
negsqrhi=$c600      ; 511 bytes

; --- zero-page pointers into the tables above ---
; The caller must initialise the HIGH byte of each pointer to its table;
; this routine only ever writes the low bytes.
p_sqr_lo=$8b        ; 2 bytes
p_sqr_hi=$8d        ; 2 bytes
p_invsqr_lo=$8f     ; 2 bytes
p_invsqr_hi=$91     ; 2 bytes

; --- inputs and outputs ---
x0=$fb              ; multiplier, 2 bytes
x1=$fc
y0=$fd              ; multiplicand, 2 bytes
y1=$fe
z0=$80              ; product, 4 bytes
z1=$81
z2=$82
z3=$83

; umult16: z3:z2:z1:z0 = x1:x0 * y1:y0
; Self-modifying: partial products are stored straight into the immediate
; operands of the LDA/ADC instructions at labels c1a..c2c ("cNr" = column N,
; row r), so the code must live in RAM. Clobbers A, Y and the flags.
; Trailing numbers are the author's running cycle counts.
umult16:
    ; select x0 as the multiplier
    lda x0
    sta p_sqr_lo
    sta p_sqr_hi
    eor #$ff
    sta p_invsqr_lo
    sta p_invsqr_hi     ;17

    ; x0*y0
    ldy y0
    sec
    lda (p_sqr_lo),y
    sbc (p_invsqr_lo),y ; these two lines are counted as 11 cycles total
    sta z0              ; low byte of x0*y0
    lda (p_sqr_hi),y
    sbc (p_invsqr_hi),y
    sta c1a+1           ; high byte of x0*y0 ;31

    ; x0*y1
    ldy y1
    ; no SEC needed: the high-byte subtraction above always gives a
    ; positive result, so carry is still set
    lda (p_sqr_lo),y
    sbc (p_invsqr_lo),y
    sta c1b+1           ; low byte of x0*y1
    lda (p_sqr_hi),y
    sbc (p_invsqr_hi),y
    sta c2a+1           ; high byte of x0*y1 ;31

    ; select x1 as the multiplier
    lda x1
    sta p_sqr_lo
    sta p_sqr_hi
    eor #$ff
    sta p_invsqr_lo
    sta p_invsqr_hi     ;17

    ; x1*y0 (carry still set, as above)
    ldy y0
    lda (p_sqr_lo),y
    sbc (p_invsqr_lo),y
    sta c1c+1           ; low byte of x1*y0
    lda (p_sqr_hi),y
    sbc (p_invsqr_hi),y
    sta c2b+1           ; high byte of x1*y0 ;31

    ; x1*y1 (carry still set, as above)
    ldy y1
    lda (p_sqr_lo),y
    sbc (p_invsqr_lo),y
    sta c2c+1           ; low byte of x1*y1
    lda (p_sqr_hi),y
    sbc (p_invsqr_hi),y
    sta z3              ; high byte of x1*y1 ;31

    ; 4*31+2*17 = 158 cycles so far

    ; --- add the partial products ---
    ; (author's note: a "jmp do_adds" with this part placed in zero page
    ;  would save 3 cycles)
do_adds:
    clc
    ; column 1, rows a+b
c1a lda #0              ; operand patched above
c1b adc #0              ; operand patched above
    sta z1              ;9

    ; column 2, rows a+b, plus the carry out of column 1
c2a lda #0              ; operand patched above
c2b adc #0              ; operand patched above
    sta z2              ;7
    bcc c1c             ; 3 taken / 9 not taken, avg 6
    inc z3              ; carry out of column 2 goes to the top byte
    clc

    ; column 1, row c
c1c lda #0              ; operand patched above
    adc z1
    sta z1              ;8

    ; column 2, row c, plus the carry out of column 1
c2c lda #0              ; operand patched above
    adc z2
    sta z2              ;8
    bcc fin             ; 3/7, avg 5
    inc z3

    ; 9+7+6+8+8+5 = 43 cycles for the additions
fin rts