Release id #106466 : PulseSounds
Thought I'd share the code behind the guts of this technique, since I've had a few questions and it would appear I wasn't clear enough in my explanation..
Equally, if anyone has any optimisations to this stuff then please share, anything to make this stuff faster is good for the whole world :)
Anyway..
The main bit generation part for each voice looks like this.. You can see the idea: we just step through eor_buffer using an 8.8 fixed-point position, marking a bit for each waveform transition.. The separate high/low parts are for the differing steps on each side of the pulse waveform itself..
For simplicity, I negate all the values on entry, hence all the lax/sbx stuff since I didn't want to be debugging these backwards..
; Pulse_Train_Generator
; Emits the generator routine for one voice. Each call of the routine walks an
; 8.8 fixed-point position (X = low/fraction byte, Y = high byte = index into
; eor_buffer) through eor_buffer, marking .bit at every step, and alternates
; between two step sizes (step 0 / step 1), one per side of the pulse.
;
; Parameters ('~' = call by reference: the macro defines the label for the caller):
;   ~.entry          routine entry point
;   ~.ppl, ~.pph     addresses of the immediate operands holding the current
;                    position (low, high); rewritten on exit
;   ~.psl0, ~.psh0   addresses of the immediate operands holding step 0 (low, high)
;   ~.psl1, ~.psh1   addresses of the immediate operands holding step 1 (low, high)
;   .store           non-zero: write .bit straight into eor_buffer;
;                    zero: EOR .bit into the existing buffer contents
;   .bit             bit mask owned by this voice
;
; Uses the undocumented 6502 opcodes LAX #imm and SBX #imm and patches its own
; operands, so the generated code has to live in writable memory.
!macro Pulse_Train_Generator ~.entry, ~.ppl, ~.pph, ~.psl0, ~.psh0, ~.psl1, ~.psh1, .store, .bit {
.entry:
; Negate all the step values for the voice..
; (16-bit negate: 0 - step, written into the operands used by the loop below)
lax #$00                        ; A = X = 0
.psl0 = * + 1                   ; caller patches step 0 low byte here
sbx #$40                        ; X = (A & X) - operand = 0 - step0.lo; carry = no borrow
stx .ipsl0
.psh0 = * + 1                   ; caller patches step 0 high byte here
sbc #$00                        ; A = 0 - step0.hi - borrow
sta .ipsh0
lax #$00                        ; A = X = 0 again for the second step
.psl1 = * + 1                   ; caller patches step 1 low byte here
sbx #$40                        ; X = 0 - step1.lo; carry = no borrow
stx .ipsl1
.psh1 = * + 1                   ; caller patches step 1 high byte here
sbc #$00                        ; A = 0 - step1.hi - borrow
sta .ipsh1
.pph = * + 1
ldy #0                          ; Y = position high byte (buffer index), saved by the previous call
.ppl = * + 1
ldx #0                          ; X = position low byte, saved by the previous call
; Use clear carry here to branch to correct entry point..
.generate:
clc                             ; makes the bcc below an unconditional branch
.branch_mod = * + 1             ; branch offset is rewritten on exit to pick the resume point
bcc .continue
.continue:
; --- step 0 side: mark the transition, then advance by step 0 ---
!if .store {
lda #.bit
sta eor_buffer,y
} else {
lda eor_buffer,y
eor #.bit
sta eor_buffer,y
}
txa                             ; A = X so that SBX's (A & X) is just X
.ipsl0 = * + 1                  ; operand = negated step 0 low byte
sbx #0                          ; X = X - (-step0.lo); carry as for a compare
tya
.ipsh0 = * + 1                  ; operand = negated step 0 high byte
sbc #0                          ; Y = Y - (-step0.hi) - borrow
tay
bcs .underflow_lo               ; carry set: the index wrapped, leave and resume at .enter_hi next call
.enter_hi:
; --- step 1 side: mark the transition, then advance by step 1 ---
!if .store {
lda #.bit
sta eor_buffer,y
} else {
lda eor_buffer,y
eor #.bit
sta eor_buffer,y
}
txa                             ; A = X so that SBX's (A & X) is just X
.ipsl1 = * + 1                  ; operand = negated step 1 low byte
sbx #0                          ; X = X - (-step1.lo)
tya
.ipsh1 = * + 1                  ; operand = negated step 1 high byte
sbc #0                          ; Y = Y - (-step1.hi) - borrow
tay
bcc .continue                   ; carry clear: still inside the buffer, back to the step 0 side
.underflow_hi:
; Index wrapped after a step 1 advance: save the position and make the next
; call start at .continue (the step 0 side).
stx .ppl
sty .pph
lda #.continue - (.branch_mod + 1) ; branch offsets are relative to the byte after the operand
sta .branch_mod
rts
.underflow_lo
; Index wrapped after a step 0 advance: save the position and make the next
; call start at .enter_hi (the step 1 side).
stx .ppl
sty .pph
lda #.enter_hi - (.branch_mod + 1)
sta .branch_mod
rts
}
This then gets instantiated like so:
; One generator per voice. PGENn is the routine entry point; PPL_n/PPH_n and
; PSL0_n/PSH0_n/PSL1_n/PSH1_n are the patchable position/step operands.
; Voice 0 is instantiated with .store = 1 (writes its bit, mask 1); voices 1
; and 2 use .store = 0 (EOR their bit, masks 2 and 4, into the buffer).
+Pulse_Train_Generator ~PGEN0, ~PPL_0, ~PPH_0, ~PSL0_0, ~PSH0_0, ~PSL1_0, ~PSH1_0, 1, 1
+Pulse_Train_Generator ~PGEN1, ~PPL_1, ~PPH_1, ~PSL0_1, ~PSH0_1, ~PSL1_1, ~PSH1_1, 0, 2
+Pulse_Train_Generator ~PGEN2, ~PPL_2, ~PPH_2, ~PSL0_2, ~PSH0_2, ~PSL1_2, ~PSH1_2, 0, 4
So we have 3 unique voice generators, each controlling one separate bit of the eor_buffer..
Next up, the volume table generation.. This is fairly simple.. Given current volumes of all 3 voices with each voice having one unique bit, we generate all possible outcomes of the bit combinations, storing this in a table..
First a little helper macro for it all..
; fold4
; Fills four consecutive VolumeTable entries starting at .offset:
;   +0 = base
;   +1 = base + .a1
;   +2 = base + .a2
;   +3 = base + .a1 + .a2
; where base = the byte at .identity, plus the byte at .a0 unless .a0 is NULL.
;
; Parameters:
;   .identity   address of the starting value
;   .offset     index of the first of the four table entries
;   .a0         address of an extra term added to the base, or NULL for none
;               (NULL is a symbol defined outside this snippet)
;   .a1, .a2    addresses of the two terms combined over the four entries
;
; No CLC is issued: every ADC relies on the carry being clear on entry and on
; none of the sums overflowing 8 bits.
!macro fold4 .identity, .offset, .a0, .a1, .a2 {
.vt = VolumeTable
lda .identity
!if .a0 != NULL {
adc .a0
}
sta .vt+.offset+0               ; base
adc .a1
sta .vt+.offset+1               ; base + a1
adc .a2
sta .vt+.offset+3               ; base + a1 + a2
lda .vt+.offset+0               ; reload base to build the a2-only entry
adc .a2
sta .vt+.offset+2               ; base + a2
}
And then the main part:
; Builds VolumeTable entries 0..7: one entry for every combination of the
; three voice bits (bit 0 = voice 0, bit 1 = voice 1, bit 2 = voice 2).
ldx #0 ; Set this to allow a constant to be propagated throughout all volume results..
; vNbit = constant | vN: the per-voice volume with the constant merged in.
txa
ora v0
sta v0bit
txa
ora v1
sta v1bit
txa
ora v2
sta v2bit
clc                             ; the only carry clear; the ADCs below and in fold4 rely on it
; 543210
;0 00000
;1 00001
;2 00010
;3 00011
.vt = VolumeTable
txa
sta .vt+0                       ; no voice bit set: just the constant
lda v0bit
sta .vt+1                       ; voice 0 only
lda v1bit
sta .vt+2                       ; voice 1 only
adc v0                          ; add the plain v0 so the constant is only included once
sta .vt+3                       ; voices 0 and 1
;************************************************
; 543210
;4 00100
; Entries 4..7: voice 2 combined with every combination of voices 0 and 1.
+fold4 v2bit, 4, NULL, v0, v1
The idea behind generating the extra v0bit v1bit v2bit with a constant in is that the 'folding' is done in such a way that if we wanted to have extra data stored in the output samples for the upper 4 bits, that it'll only ever be added in once, hence we can roll through this whole thing using additions with no clearing of carries, or worrying about the special bits getting cleared by adding them to themselves..
It isn't actually used, but is where you'd load up filter settings for the top bits of D418..
For completeness, here's the rest of the table up to 6 voices..
; Entries 8..63, four at a time. Each fold starts either from a vNbit value
; (a single new voice) or from an already computed table entry plus one more
; voice, then adds the four combinations of v0 and v1.
; NOTE(review): v3bit, v4bit and v5bit are not built in the code shown above;
; presumably they are set up the same way as v0bit..v2bit - confirm.
;8 01000
+fold4 v3bit, 8, NULL, v0, v1
;12 01100
+fold4 .vt+8, 12, v2, v0, v1
;************************************************
; 543210
;16 10000
+fold4 v4bit, 16, NULL, v0, v1
;20 10100
+fold4 .vt+16, 20, v2, v0, v1
;24 11000
+fold4 .vt+16, 24, v3, v0, v1
;28 11100
+fold4 .vt+24, 28, v2, v0, v1
;************************************************
; 543210
;32 100000
+fold4 v5bit, 32, NULL, v0, v1
;36 100100
+fold4 .vt+32, 36, v2, v0, v1
;40 101000
+fold4 .vt+32, 40, v3, v0, v1
;44 101100
+fold4 .vt+40, 44, v2, v0, v1
;48 110000
+fold4 .vt+32, 48, v4, v0, v1
;52 110100
+fold4 .vt+48, 52, v2, v0, v1
;56 111000
+fold4 .vt+48, 56, v3, v0, v1
;60 111100
+fold4 .vt+56, 60, v2, v0, v1
It's not as optimal as it could be.. There's a few points where LAX could be used to save reloading, but I had enough trouble getting my head around a magic order to do this automatically anyway, which as you can see from the above, I failed, hence this solution :)
Next the actual output buffer generation.. This is the bit that takes the sparse eor_buffer and generates the output samples and lengths, and also does the volume table lookup..
First the unrolled version:
!zone
; Generate_Pulse_Buffer (unrolled)
; Converts the sparse transition buffer at .source into (sample, length) pairs.
; For every non-zero entry (and unconditionally for entry 0) it:
;   - toggles the entry's bits in zp_pgen_previous, the running voice state,
;   - looks the new state up in VolumeTable and stores it in sample_buffer,
;   - stores (index - previous index - 1) in length_buffer,
;   - remembers the index in zp_len_previous and clears the source entry.
;
; Parameters:
;   .source   address of the transition buffer
;   .size     upper bound of the !for loop
;             NOTE(review): with ACME's "!for var, start, end" form the end
;             value is inclusive, so .size is presumably the last index
;             (e.g. 255, which also keeps "lda #.i" in 8 bits) - confirm
;             against the caller.
;
; Reads and updates SampleWriteIndex; uses A, X and Y.
!macro Generate_Pulse_Buffer .source, .size {
ldy SampleWriteIndex            ; Y = write position in sample_buffer / length_buffer
!for .i, 0, .size {
lda .source + .i
!if .i != 0 {
beq +                           ; no transition at this index: skip (index 0 always emits)
}
eor zp_pgen_previous            ; toggle the voices that changed
sta zp_pgen_previous
tax
lda VolumeTable,x               ; output level for the new combination of voice bits
sta sample_buffer,y
lda #.i
clc                             ; carry clear makes SBC subtract one extra
sbc zp_len_previous             ; A = .i - previous index - 1
sta length_buffer,y
lda #.i
sta zp_len_previous
iny
lda #0
sta .source + .i                ; clear the entry for the next pass
+
}
sty SampleWriteIndex
}
And the rolled version..
; Rolled version of the buffer generation: scans eor_buffer with X and emits
; a (sample, length) pair for entry 0 and for every non-zero entry, clearing
; each emitted entry. The running voice state is kept in the low byte of the
; address operand of "lda VolumeTable" at .vol, and the previous index in the
; operand of the SBC at .prev (both self-modified).
; NOTE(review): keeping the state in the low address byte assumes VolumeTable
; is page-aligned - confirm.
ldy SampleWriteIndex            ; Y = write position in sample_buffer / length_buffer
ldx #0
lda eor_buffer,x
jmp .nz                         ; entry 0 always produces a sample, zero or not
.loop:
lda eor_buffer,x
bne .nz
inx
bne .loop                       ; X wrapping to 0 ends the scan
sty SampleWriteIndex
rts
.nz:
eor .vol+1                      ; toggle the changed voice bits in the table index
sta .vol+1
.vol:
lda VolumeTable                 ; low address byte patched above
sta sample_buffer,y
txa
clc                             ; carry clear makes SBC subtract one extra
.prev sbc #0                    ; A = X - previous index - 1 (operand patched below)
sta length_buffer,y
stx .prev+1                     ; remember this index for the next length
iny
lda #0
sta eor_buffer,x                ; clear the entry for the next pass
inx
bne .loop
sty SampleWriteIndex
rts
The idea is that every 256 samples it automatically puts out one sample, then encodes the remainder as lengths from the previous one, and stores the lengths and samples out.. It also clears the eor_buffer as it goes..
I'm sure there's some big speedups in this bit.. I was starting to go blind with this function though..
And finally the NMI code itself, although this is kind of obvious..
; NMI handler, assembled to run from zero page at $80 so that its own operands
; can be reached with zero-page addressing (the "+1" suffix forces an 8-bit
; address).
; Each NMI writes the next byte of sample_buffer to $d418 and the next byte of
; length_buffer to $dd06, then advances both read positions by incrementing
; the low byte of the corresponding address operand.
; A is preserved by patching it into the operand of the final "lda #"; X and Y
; are not touched.
!pseudopc $80 {
sta+1 IRQ0_a                    ; save A into the operand of the "lda #" below
irq_sample_index = * + 1        ; low byte of the sample read address
lda sample_buffer
sta $d418
inc+1 irq_sample_index          ; next sample; only the low byte moves, so it wraps within the page
irq_length_index = * + 1        ; low byte of the length read address
lda length_buffer
sta $dd06
inc+1 irq_length_index          ; next length, same wrap-around
IRQ0_a = * + 1
lda #$00                        ; restore A (operand patched on entry)
; NOTE(review): presumably $dd0c holds an RTI opcode, so that executing it
; also reads $dd0d and acknowledges the NMI - confirm against the setup code.
jmp $dd0c
}
The NMIs are set up so that they trigger at a fixed point, initialised just after the beginning of a scanline, which gives a nice spacing for badlines, and also for back-to-back samples (1 scanline) with badlines.. There's no science to the timing setup, just ears, and this works best to me..
Anyway, that's it really.. All the code to do this stuff :)