# Generator script: writes the quarter-square lookup tables ("tables.inc")
# and an unrolled 6502 add sequence ("mc.inc") that sums table entries into
# an 8-byte result at mRes.
#
# NOTE(review): dumpArrayToA65 is defined elsewhere; it is called here as
# dumpArrayToA65(file, label, list_of_byte_values).


def lo(x):
    """Return the low byte of x."""
    return x & 255


def hi(x):
    """Return x shifted right by 8 bits (the high byte for 16-bit x)."""
    return x >> 8


def f(x):
    """Return the quarter square of x, rounded down."""
    return x * x // 4


def g(x):
    """Return 0x4000 minus the quarter square of (x - 255), as 16 bits.

    The table index is biased by 255 so that indices 0..511 cover
    differences -255..256.
    """
    return (0x4000 - f(x - 255)) & 0xffff


# Each table has 512 entries; the 16-bit f and g values are split into
# separate low-byte and high-byte tables.  "id" simply maps i -> i & 255.
# The `with` block guarantees the file is flushed and closed before the
# name `fo` is reused for the second output file.
with open("tables.inc", "w") as fo:
    dumpArrayToA65(fo, "flo", [lo(f(i)) for i in range(512)])
    dumpArrayToA65(fo, "fhi", [hi(f(i)) for i in range(512)])
    dumpArrayToA65(fo, "glo", [lo(g(i)) for i in range(512)])
    dumpArrayToA65(fo, "ghi", [hi(g(i)) for i in range(512)])
    dumpArrayToA65(fo, "id", [lo(i) for i in range(512)])

# Accumulate -(0x4000 << 8*(i+j)) over all 16 byte pairs (i, j).
# NOTE(review): this presumably cancels the 0x4000 bias that g() adds to
# every partial product -- confirm against the assembly that uses it.
mAcc = 0
for i in range(4):
    for j in range(4):
        mAcc -= 0x40 << (8 * (1 + i + j))

# The eight bytes of mAcc (two's complement), least significant first.
initialValue = [((mAcc >> s) & 0xff) for s in range(0, 64, 8)]

# Index most recently emitted in an "ldy mT2+n"; None until the first one.
lasty = None


def addB(yv, zp, tb):
    """Emit one table-byte addition into the accumulator.

    Writes to the module-level file `fo`.

    yv -- offset into mT2 whose byte must be in Y; an "ldy" is emitted only
          when this differs from the previously emitted offset.
    zp -- name of the zero-page pointer used by the "adc (zp),y".
    tb -- index of the result byte being built; for tb < 7 a carry out of
          the add is counted in X, for the last byte it is only cleared.
    """
    global lasty
    if yv != lasty:
        print(""" ldy mT2+{yv}""".format(yv=yv), file=fo)
        lasty = yv
    print(""" adc ({zp}),y""".format(zp=zp), file=fo)
    if tb < 7:
        # Carry set: bump the carry count in X, then clear carry.
        print(""" bcc *+4:inx:clc""", file=fo)
    else:
        # Top byte: there is no higher byte, so just drop the carry.
        print(""" bcc *+3:clc""", file=fo)


with open("mc.inc", "w") as fo:
    for tb in range(8):
        print(""" ; tb={tb} """.format(tb=tb), file=fo)
        if tb == 0:
            # First byte: start from a clean carry, zero carry count and
            # the lowest byte of the initial value.
            print(""" clc """, file=fo)
            print(""" ldx#0 """, file=fo)
            print(""" lda #${iv:02x} """.format(iv=initialValue[tb]), file=fo)
        else:
            # Later bytes start from the carries counted in X for the
            # previous byte, plus this byte of the initial value.
            print(""" txa""", file=fo)
            if tb < 7:
                print(""" ldx#0 """, file=fo)
            print(""" adc#${iv:02x}""".format(iv=initialValue[tb]), file=fo)
            # NOTE(review): the carry check is emitted only for large
            # constants, presumably because only then can the add overflow.
            if initialValue[tb] > 0xef:
                print(""" bcc *+4:inx:clc""", file=fo)
        for j in range(4):
            # Low table bytes of pair (i, j) land in result byte i + j ...
            i = tb - j
            if 0 <= i <= 3:
                addB(i, "zp_fl{j}".format(j=j), tb)
                addB(i, "zp_gl{j}".format(j=j), tb)
            # ... and high table bytes land one result byte higher.
            i = tb - j - 1
            if 0 <= i <= 3:
                addB(i, "zp_fh{j}".format(j=j), tb)
                addB(i, "zp_gh{j}".format(j=j), tb)
        print(""" sta mRes+{tb}""".format(tb=tb), file=fo)
; Copy byte 0 of mT1 into the zero-page locations zp_fl0/zp_fh0, and its
; one's complement into zp_gl0/zp_gh0.
; NOTE(review): the generated code reads these via "adc (zp_xxN),y", so this
; presumably sets the low byte of each table pointer -- confirm.
lda mT1+ 0
sta zp_fl0
sta zp_fh0
; A = 255 - A
eor#255
sta zp_gl0
sta zp_gh0
; Store A - X into z2, using z3 as a temporary for X.
stx z3
; set carry so sbc subtracts with no borrow
sec
sbc z3
sta z2
Good job, that's right in the range of what I thought was possible.
I have an improvement; instead of trashing A to change the multiplier, you can prestuff pointers with the 4 multipliers.
By offsetting by $4000, doesn't that reduce the domain?
Correction is fast; it's only four instructions: stx z3, sec, sbc z3, sta z2.
Also, yours shouldn't be any faster than my approach from what I can tell, though I do have some ideas to speed up the adds again... we'll see :)
; Store A - id[X] into z2 without a temporary.
; NOTE(review): assumes id is the table where entry i holds i & 255, so
; id,x equals X -- confirm.
; set carry so sbc subtracts with no borrow
sec
sbc id,x
sta z2
About the correction: I think you're adding things up wrong. I only use the correction for those columns where it's faster, and I found the break-even point at 7 adds, so it should work. All but the outer 1 or 2 columns can use it.