; ---------------------------------------------------------------------------
; 16x16 -> 32 bit unsigned multiply using quarter-square lookup tables.
;
; Zero-page inputs : x0, x1 and y0, y1 (x0*y0 feeds result column 0, so
;                    x0/y0 are the low bytes and x1/y1 the high bytes).
; Outputs          : z0 (column 0) .. z3 (column 3).
; NOTE(review): z0..z3 are not defined in this listing; they must be
; defined elsewhere before this assembles.
;
; makesqrtables must be called before umult16: it builds the tables and
; sets the high bytes of the eight zero-page pointers. umult16 itself only
; rewrites the pointer low bytes.
; ---------------------------------------------------------------------------

; multiplier / multiplicand bytes
x0=$fb
x1=$fc
y0=$fd
y1=$fe

; zero-page pointers into the tables, 2 bytes each
x0_sqr_lo=$8b;2 bytes
x0_sqr_hi=$8d
x0_negsqr_lo=$8f
x0_negsqr_hi=$91
x1_sqr_lo=$93;2 bytes
x1_sqr_hi=$95
x1_negsqr_lo=$97
x1_negsqr_hi=$99

; table base addresses (page aligned, two pages apart)
sqrlo=$c000;510 bytes
sqrhi=$c200
negsqrlo=$c400
negsqrhi=$c600

; ---------------------------------------------------------------------------
; umult16: multiply x1:x0 by y1:y0, result in z3:z2:z1:z0.
; Clobbers A, X, Y and the low bytes of the zero-page table pointers.
; X accumulates the carries/borrows of the current column and is handed to
; the next column via txa.
; NOTE(review): the sbc instructions are entered with carry clear and the
; dex fix-ups run when carry is SET after sbc; this convention has not been
; verified here - confirm against a test run before relying on it.
; ---------------------------------------------------------------------------
umult16:
        ;init zp square tables pointers
        ; low byte = x  -> (ptr),y reads table[x+y]
        ; low byte = x eor $ff = 255-x -> (ptr),y reads negtable[255-x+y]
        lda x0
        sta x0_sqr_lo
        sta x0_sqr_hi
        eor #$ff
        sta x0_negsqr_lo
        sta x0_negsqr_hi;17
        lda x1
        sta x1_sqr_lo
        sta x1_sqr_hi
        eor #$ff
        sta x1_negsqr_lo
        sta x1_negsqr_hi;17

        ldx #0;start column 0
        ldy y0
        SEC
        LDA (x0_sqr_lo),y
        SBC (x0_negsqr_lo),y
        sta z0;x0*y0 lo, C=1

        ;start column 1
        ;Y=y0
        clc
        LDA (x0_sqr_hi),y;x0*y0 hi
        ADC (x1_sqr_lo),y;+x1*y0 lo
        bcc c1s1;8.5/11.5 avg
        inx
        clc
c1s1    sbc (x0_negsqr_hi),y;x0*y0 hi
        bcc c1s2
        dex
        clc
c1s2    sbc (x1_negsqr_lo),y;-x1*y0 lo
        bcc c1s3
        dex
        clc
c1s3    ldy y1
        adc (x0_sqr_lo),y;x0*y1 lo
        bcc c1s4
        inx
        clc
c1s4    SBC (x0_negsqr_lo),y;A=x0*y1 lo
        bcc c1s5
        dex
        clc
        ;end of column 1
c1s5    sta z1;column 1

        ;start column 2
        ldy y0
        txa;carries from column 1
        ldx #0;reset carries
        clc
        adc (x1_sqr_hi),y;+x1*y0 hi
        bcc c2s1
        ; NOTE(review): unlike every other carry fix-up in this routine,
        ; no clc follows this inx, so the sbc at c2s1 can be entered with
        ; carry set - confirm this is intended.
        inx
c2s1    sbc (x1_negsqr_hi),y;-x1*y0 hi
        bcc c2s2
        dex
        clc
c2s2    ldy y1
        adc (x0_sqr_hi),y;+x0*y1 hi
        bcc c2s3
        inx
        clc
c2s3    adc (x1_sqr_lo),y;+x1*y1 lo
        bcc c2s4
        inx
        clc
c2s4    sbc (x0_negsqr_hi),y;-x0*y1 hi
        bcc c2s5
        dex
        clc
c2s5    sbc (x1_negsqr_lo),y;-x1*y1 lo
        bcc c2s6
        dex
        clc
c2s6    sta z2;column 2

        ;start column 3
        ;Y=y1
        txa;carries from column 2
        clc
        adc (x1_sqr_hi),y;+x1*y1 hi
        sbc (x1_negsqr_hi),y;-x1*y1 hi
        ;shouldn't be any carries in the msb
        sta z3;column 3
        rts

; ---------------------------------------------------------------------------
; makesqrtables: set the high bytes of the zero-page table pointers and
; build the sqr / negsqr tables. Clobbers A, X, Y.
; NOTE(review): the generator patches its own operands (sm1+2, sm2+1,
; sm3+2) and never restores them, so it appears to be safe to call only
; once per program load - confirm before calling it again.
; ---------------------------------------------------------------------------
makesqrtables:
        ;init zp square tables pointers
        lda #>sqrlo
        sta x0_sqr_lo+1
        sta x1_sqr_lo+1
        lda #>sqrhi
        sta x0_sqr_hi+1
        sta x1_sqr_hi+1
        lda #>negsqrlo
        sta x0_negsqr_lo+1
        sta x1_negsqr_lo+1
        lda #>negsqrhi
        sta x0_negsqr_hi+1
        sta x1_negsqr_hi+1

        ;generate sqr(x)=x^2/4
        ldx #$00
        txa
        !by $c9 ; CMP #immediate - skip TYA and clear carry flag
makesqrtables_loop1:
        tya
        adc #$00
makesqrtables_sm1:
        sta sqrhi,x             ; operand high byte is bumped per page below
        tay
        cmp #$40
        txa
        ror
makesqrtables_sm2:
        adc #$00                ; immediate operand is rewritten by the next sta
        sta makesqrtables_sm2+1
        inx
makesqrtables_sm3:
        sta sqrlo,x             ; operand high byte is bumped per page below
        bne makesqrtables_loop1
        inc makesqrtables_sm3+2 ; advance both stores to the next page
        inc makesqrtables_sm1+2
        clc
        iny
        bne makesqrtables_loop1

        ;generate negsqr(x)=(255-x)^2/4
        ; negsqr[$100+x] = sqr[x+1] and negsqr[$ff-x] = sqr[x]
        ldx #$00
        ldy #$ff
maketables_loop2:
        lda sqrhi+1,x
        sta negsqrhi+$100,x
        lda sqrhi,x
        sta negsqrhi,y
        lda sqrlo+1,x
        sta negsqrlo+$100,x
        lda sqrlo,x
        sta negsqrlo,y
        dey
        inx
        bne maketables_loop2    ; fixed: stray ':' removed from the branch operand
        rts
              y1     y0
              x1     x0
              ---------
              x0*y0h x0*y0l
       x1*y0h x1*y0l
       x0*y1h x0*y1l
x1*y1h x1*y1l
---------------------------

24x24 bits

                  x2    x1    x0
                  y2    y1    y0
                  --------------
                        y0x0h y0x0l
                  y0x1h y0x1l
            y0x2h y0x2l
                  y1x0h y1x0l
            y1x1h y1x1l
      y1x2h y1x2l
            y2x0h y2x0l
      y2x1h y2x1l
y2x2h y2x2l
# Generator script: writes the quarter-square lookup tables to "tables.inc"
# and an unrolled 6502 multiply routine (8 result bytes, 4x4 operand bytes)
# to "mc.inc".
# NOTE(review): dumpArrayToA65 is not defined in this part of the file; it is
# assumed to exist elsewhere and to accept (file, label, list_of_values).


def lo(x):
    """Return the low byte of *x*."""
    return x & 255


def hi(x):
    """Return *x* shifted right by eight bits."""
    return x >> 8


def f(x):
    """Return the quarter square ``x*x // 4``."""
    return x * x // 4


def g(x):
    """Return ``0x4000 - f(x - 255)`` wrapped to 16 bits."""
    return (0x4000 - f(x - 255)) & 0xffff


# The context manager guarantees tables.inc is flushed and closed before the
# name ``fo`` is rebound to the second output file.
with open("tables.inc", "w") as fo:
    dumpArrayToA65(fo, "flo", [lo(f(i)) for i in range(512)])
    dumpArrayToA65(fo, "fhi", [hi(f(i)) for i in range(512)])
    dumpArrayToA65(fo, "glo", [lo(g(i)) for i in range(512)])
    dumpArrayToA65(fo, "ghi", [hi(g(i)) for i in range(512)])
    dumpArrayToA65(fo, "id", [lo(i) for i in range(512)])

# Every g() entry carries a +0x4000 bias. One f/g pair is added per (i, j)
# operand-byte pair at byte offset i+j, so the combined bias is subtracted
# up front by seeding each result byte with the matching byte of mAcc.
mAcc = 0
for i in range(4):
    for j in range(4):
        mAcc -= 0x40 << (8 * (1 + i + j))
initialValue = [((mAcc >> s) & 0xff) for s in range(0, 64, 8)]

# Index of the mT2 byte currently held in the Y register of the generated
# code; None means "unknown", which forces the first ldy to be emitted.
lasty = None


def addB(yv, zp, tb):
    """Emit one table add for result byte *tb* into the open ``fo`` file.

    yv -- index of the mT2 byte that must be in Y; an ``ldy`` is emitted only
          when it differs from the previously loaded one (global ``lasty``).
    zp -- name of the zero-page pointer used by ``adc (zp),y``.
    tb -- result byte being built; for tb < 7 a carry out is counted in X,
          for the last byte the carry is just cleared.
    """
    global lasty
    if yv != lasty:
        print(""" ldy mT2+{yv}""".format(yv=yv), file=fo)
        lasty = yv
    print(""" adc ({zp}),y""".format(zp=zp), file=fo)
    if tb < 7:
        print(""" bcc *+4:inx:clc""", file=fo)
    else:
        print(""" bcc *+3:clc""", file=fo)


with open("mc.inc", "w") as fo:
    for tb in range(8):
        print(""" ; tb={tb} """.format(tb=tb), file=fo)
        if tb == 0:
            # First byte: start from the bias byte with no carries pending.
            print(""" clc """, file=fo)
            print(""" ldx#0 """, file=fo)
            print(""" lda #${iv:02x} """.format(iv=initialValue[tb]), file=fo)
        else:
            # Later bytes: start from the carries counted in X, then add the
            # bias byte for this position.
            print(""" txa""", file=fo)
            if tb < 7:
                print(""" ldx#0 """, file=fo)
            print(""" adc#${iv:02x}""".format(iv=initialValue[tb]), file=fo)
            if initialValue[tb] > 0xef:
                # Only large bias bytes get a carry check after the adc#.
                print(""" bcc *+4:inx:clc""", file=fo)
        for j in range(4):
            # Low table bytes of pair (i, j) land in result byte i + j ...
            i = tb - j
            if 0 <= i <= 3:
                addB(i, "zp_fl{j}".format(j=j), tb)
                addB(i, "zp_gl{j}".format(j=j), tb)
            # ... and the high table bytes in result byte i + j + 1.
            i = tb - j - 1
            if 0 <= i <= 3:
                addB(i, "zp_fh{j}".format(j=j), tb)
                addB(i, "zp_gh{j}".format(j=j), tb)
        print(""" sta mRes+{tb}""".format(tb=tb), file=fo)
; Load byte 0 of mT1 and store it, unchanged, into zp_fl0 and zp_fh0, then
; store its one's complement (eor #255) into zp_gl0 and zp_gh0.
; NOTE(review): presumably these are the low bytes of the pointers used by
; the generated "adc (zp_..),y" code; the high bytes are not set here.
        lda mT1+ 0
        sta zp_fl0
        sta zp_fh0
        eor#255
        sta zp_gl0
        sta zp_gh0
; Store A - X in z2. X is parked in z3 first because SBC has no register
; operand; z3 is therefore overwritten with X.
        stx z3          ; z3 = X
        sec             ; carry set = no borrow going in
        sbc z3          ; A = A - X
        sta z2          ; z2 = result
Good job, that's right in the range of what I thought was possible.
I have an improvement; instead of trashing A to change the multiplier, you can prestuff pointers with the 4 multipliers.
By offsetting by $4000, doesn't that reduce the domain?
The correction is fast; it's only: stx z3 / sec / sbc z3 / sta z2
Also yours shouldn't be any faster than my approach from what I can tell, though I do have some ideas to speed up adds again.. we'll see :)
; Store A - id[X] in z2.
; NOTE(review): if "id" is the identity table emitted by the table generator
; script in this file (id[i] = i & 255), this equals A - X and needs no
; scratch store of X - confirm that "id" refers to that table.
        sec             ; carry set = no borrow going in
        sbc id,x        ; A = A - id[X]
        sta z2          ; z2 = result