gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > Any room for optimization?

#157777 - Ruben - Thu May 29, 2008 2:39 am

Hey guys. I recently finished a stereo mixer which I should be about to release but it takes a bit long to mix down each channel. I was wondering if there's any room for improvement?

Code:
.section .iwram, "ax", %progbits
.align
.global S3MTmp

@ S3MTmp: scratch accumulation buffer used by S3MMixdown.
@ The mixer zeroes it, adds every active channel into it as 16-bit values,
@ then scales/clamps the result down to 8 bit.
@ Size: 304*2*16/8 = 1216 bytes -- presumably 304 samples x 2 (left/right)
@ x 16 bits, expressed in bytes.
S3MTmp:
        .space 304*2*16/8

.size S3MTmp, .-S3MTmp
.align

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

@-----------------------------------------------------------------------
@ S3MMixdown -- mix all active channels into the stereo output buffers.
@
@ Target: ARM state (GAS divided syntax), placed in .iwram.
@ In:     r0 = number of samples to mix. Samples are handled in pairs
@              (and cleared in groups of 4), so the count is presumably
@              expected to be a multiple of 4 -- confirm against caller.
@ Out:    nothing; S3MVrs.mb is advanced by the sample count and the
@         status/position of every channel that was mixed is written back.
@ Clobb:  r0-r3, r12, flags.  r4-r11 are preserved.
@ Stack:  9 words (r4-r11, lr).
@
@ Stages:
@   1. zero S3MTmp (4 bytes per sample: 16-bit left + 16-bit right)
@   2. for each of the 32 channel records, accumulate sample * volume
@   3. shift the 16-bit sums down by 7, clamp to [-128, 127] and store
@      them as bytes to the left/right output buffers
@-----------------------------------------------------------------------
.section .iwram, "ax", %progbits
.align
.arm

.global        S3MMixdown
.extern S3MBuffer
.extern S3MVrs

@ tSndArea offsets
.equ SA_MIXPTR,        0x04                         @ mb: current mix buffer pointer
.equ SA_MASTERVOL,     0x15                         @ mv: master volume
.equ SA_CHANNELS,      0x28                         @ pc[0]: first channel record

@ tChn layout
.equ CHN_POS,          0x08                         @ cp: position (20.12)
.equ CHN_LOOPBEGIN,    0x14                         @ lb: loop begin (20.12)
.equ CHN_SIZE,         0x18                         @ sizeof(tChn)
.equ CHN_ACTIVE,       0x01                         @ cs bit0: channel is playing
.equ CHN_LOOP,         0x02                         @ cs bit1: sample loops

.equ S3M_CHANNELS,     0x20                         @ number of channel records (32)
.equ S3M_RIGHT_OFFSET, 0x260                        @ right buffer = left buffer + 0x260

S3MMixdown:
.LStart:
        stmfd      sp!, {r4-r11, r14}               @ save used registers

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ stage 1: clear the accumulation buffer

.LPreClearLoop:
        ldr        r1, =S3MTmp                      @ r1 = S3MTmp write cursor
        mov        r2, r0                           @ r2 = samples left to clear
        mov        r3, #0x00                        @ \
        mov        r4, #0x00                        @  |
        mov        r5, #0x00                        @  | four zero words for stmia
        mov        r6, #0x00                        @ /

.LClearLoop:
        stmia      r1!, {r3-r6}                     @ clear 16 bytes = 4 stereo samples
        subs       r2, r2, #0x04                    @ subtract those 4 samples
        bhi        .LClearLoop                      @ if any are left, keep clearing

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ stage 2: accumulate every active channel into S3MTmp

.LPreChannelLoop:
        mov        r1, #S3M_CHANNELS                @ r1 = channels left
        ldr        r2, =S3MVrs                      @ \
        add        r2, r2, #SA_CHANNELS             @  r2 = pointer to current channel

.LChannelLoop:
        ldr        r3, [r2]                         @ r3 = channel status (cs)
        tst        r3, #CHN_ACTIVE                  @ \
        beq        .LChannelNext                    @  inactive: skip, leave the record untouched

.LPreSampleLoop:
        mov        r4, r0                           @ r4 = samples left to mix
        ldmib      r2, {r5-r8}                      @ r5 = source
                                                    @ r6 = position
                                                    @ r7 = increment
                                                    @ r8 = length
        ldr        r9, =S3MTmp                      @ r9 = S3MTmp cursor
        ldr        r11, =S3MVrs                     @ \
        ldrb       r12, [r11, #SA_MASTERVOL]        @  r12 = master volume

        mov        r10, r3, asr #0x02               @ \
        and        r10, r10, #0x7F                  @  r10 = left volume (cs bits 2-8)
        mul        r10, r12, r10                    @ r10 = channel vol (left) * master vol
        mov        r10, r10, asr #0x04              @ r10 = final left volume

        mov        r11, r3, asr #0x09               @ \
        and        r11, r11, #0x7F                  @  r11 = right volume (cs bits 9-15)
        mul        r11, r12, r11                    @ r11 = channel vol (right) * master vol
        mov        r11, r11, asr #0x04              @ r11 = final right volume

.LSampleLoop:
@ fetch two consecutive output samples and pack them into one word so that
@ a single mla scales both (low half = first sample, high half = second).
@ NOTE(review): the first sample is sign-extended to 32 bits, so a negative
@ value borrows 1 from the high half.
        add        r12, r5, r6, lsr #0x0C           @ r12 = address of sample (integer part of pos)
        ldrsb      r12, [r12]                       @ r12 = first sample
        add        r6, r6, r7                       @ pos += inc
        add        r14, r5, r6, lsr #0x0C           @ r14 = address of next sample
        ldrsb      r14, [r14]                       @ r14 = second sample
        add        r6, r6, r7                       @ pos += inc
        add        r12, r12, r14, lsl #0x10         @ r12 = 2 packed samples

@ do left channel
        ldr        r14, [r9]                        @ r14 = mixed data so far
        mla        r14, r10, r12, r14               @ r14 += left volume * samples
        str        r14, [r9], #0x04                 @ store mixed data and increment

@ do right channel
        ldr        r14, [r9]                        @ r14 = mixed data so far
        mla        r14, r11, r12, r14               @ r14 += right volume * samples
        str        r14, [r9], #0x04                 @ store mixed data and increment

        cmp        r6, r8                           @ \
        bhs        .LSampleNoData                   @  if(pos >= length) handle end of sample
                                                    @  (unsigned: positions are u32 20.12)

.LSampleLoopEnd:
        subs       r4, r4, #0x02                    @ \
        bhi        .LSampleLoop                     @  if(samples left) keep looping

.LChannelLoopEnd:
        str        r3, [r2]                         @ write back status (active flag may be cleared)
        str        r6, [r2, #CHN_POS]               @ write back position

.LChannelNext:
        add        r2, r2, #CHN_SIZE                @ channel++
        subs       r1, r1, #0x01                    @ \
        bne        .LChannelLoop                    @  if(--channels left != 0) loop again
        b          .LPreDownsampleLoop              @ skip the end of sample handler

.LSampleNoData:
        tst        r3, #CHN_LOOP                    @ test for loop flag
        ldrne      r6, [r2, #CHN_LOOPBEGIN]         @ if(looping) restart at loop begin position
        bne        .LSampleLoopEnd                  @  and keep mixing

        bic        r3, r3, #CHN_ACTIVE              @ clear the active flag
        b          .LChannelLoopEnd                 @ stop mixing this channel

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ stage 3: scale the 16-bit sums to 8 bit, clamp, and de-interleave.
@ S3MTmp holds, per pair of samples: left0, left1, right0, right1 (halfwords).
@ Clamp sequence: movs sets N from the scaled value; a negative value is
@ compared with -128 (cmnmi) and replaced by -128 if lower (mvnlt); otherwise
@ it is compared with 127 (cmppl) and replaced by 127 if higher (movgt).

.LPreDownsampleLoop:
        ldr        r1, =S3MTmp                      @ r1 = S3MTmp read cursor
        ldr        r2, =S3MVrs                      @ r2 = S3MVrs
        ldr        r3, [r2, #SA_MIXPTR]!            @ r3 = left buffer location; r2 = &S3MVrs.mb
        add        r4, r3, #S3M_RIGHT_OFFSET        @ r4 = right buffer location
        add        r5, r3, r0                       @ r5 = buffer location + samples mixed
        str        r5, [r2]                         @ store the advanced location

.LDownsampleLoop:
        ldrsh      r5, [r1], #0x02                  @ get mixed sample (left, first)
        movs       r5, r5, asr #0x07
        cmnmi      r5, #0x80
        mvnlt      r5, #0x7F
        cmppl      r5, #0x7F
        movgt      r5, #0x7F
        strb       r5, [r3], #0x01                  @ store
        ldrsh      r5, [r1], #0x02                  @ get mixed sample (left, second)
        movs       r5, r5, asr #0x07
        cmnmi      r5, #0x80
        mvnlt      r5, #0x7F
        cmppl      r5, #0x7F
        movgt      r5, #0x7F
        strb       r5, [r3], #0x01                  @ store

        ldrsh      r5, [r1], #0x02                  @ get mixed sample (right, first)
        movs       r5, r5, asr #0x07
        cmnmi      r5, #0x80
        mvnlt      r5, #0x7F
        cmppl      r5, #0x7F
        movgt      r5, #0x7F
        strb       r5, [r4], #0x01                  @ store
        ldrsh      r5, [r1], #0x02                  @ get mixed sample (right, second)
        movs       r5, r5, asr #0x07
        cmnmi      r5, #0x80
        mvnlt      r5, #0x7F
        cmppl      r5, #0x7F
        movgt      r5, #0x7F
        strb       r5, [r4], #0x01                  @ store

        subs       r0, r0, #0x02                    @ two samples done
        bhi        .LDownsampleLoop                 @ loop while samples remain (bcs would run
                                                    @ one extra pair when the count reaches 0)

.LEnd:
        ldmfd      sp!, {r4-r11, r14}               @ restore used registers
        bx         lr                               @ return


These are the structs
Code:
/* Per-channel mixer state, read and written back by S3MMixdown.
 * Six 32-bit fields; the assembly steps through the records 0x18 bytes at a
 * time (assumes 32-bit pointers). Positions and lengths are 20.12 fixed point. */
typedef struct tChn {
 u32 cs; //channel status (bit0: on. bit1: loop. bit2-8: vol left. bit9-F: vol right) (+0)
 s8* wp; //wave pointer (+4)
 u32 cp; //channel position (20.12 fixed point) (+8)
 u32 ci; //channel increment (20.12 fixed point) (+C)
 u32 cl; //channel length (20.12 fixed point) (+10)
 u32 lb; //loop begin (20.12 fixed point) (+14)
} tChn;

/* Global sound state (the S3MVrs object the mixer addresses by fixed offsets).
 * The (+N) notes are byte offsets from the start of the struct; the assembly
 * relies on mb at +4, mv at +15 and pc at +28. */
typedef struct tSndArea {
//mixer stuff
 s8       *bb; //base buffer (+0)
 s8       *mb; //mix buffer (current) (+4)
 
 u32       fr; //mix frequency (+8)
 u32       st; //samples til tick (20.12FP) (+C)
 u32       sp; //samples per tick (20.12FP) (+10)
 
 u8        ab; //active buffer (+14)
 u8        mv; //master volume (default 15) (+15)
 u16       sc; //sound control (+16)
 
//player stuff
 tS3M     *pm; //pointer to module (+18)
 u8       *nb; //next byte (+1C)
 
 u8        mp; //module speed (+20)
 u8        mt; //module tempo (+21)
 u8        mc; //mixer count/tick/tap (+22)
 
 u8        cr; //current row (+23)
 u8        co; //current order (+24)
 u8        oc; //order count (+25)
 
 //2 padding bytes (+26, +27): tChn needs 32-bit alignment
 tChn  pc[32]; //player channels (+28)
 
 u8        pd; //pattern delays left (followed by 3 padding bytes before ef)
 
 u32       ef;     //effect flags
 u8        ep[32]; //effect parameters
} tSndArea;


The mono code for the mixer is really fast but after I added those 3 instructions for the right channel, it's slowed down quite a lot. Thanks a lot guys.

EDIT: Fixed up the misaligned text

#157786 - DekuTree64 - Thu May 29, 2008 5:31 am

Loads, multiplies, and stores are some of the slowest instructions, so I'm not surprised that those 3 would add a lot.

I'm a bit rusty on mixer optimization and I haven't thoroughly examined the code, but here are a few things I noticed:

One "free" speed boost you can have here is to swap the middle 2 registers on the multiplies. The cycles taken depends on the number of "significant" bytes in the SECOND multiplying register. Since you have 2 samples packed together, that register should go first, and the volume second, because the volume should be all zeroes in the upper 3 bytes.

That said, I'd recommend packing the left and right volumes together, rather than 2 samples. Saves at least the one add per 2 samples, and probably will save some registers too. But in that case, you would want to multiply with the volume register first, followed by the one byte sample.

Since you're not using a whole lot of registers even as it is, how about unrolling that loop 4 or 8 times and ldm/stm'ing the temp buffer values?

Also, make sure the temp buffer is in IWRAM. Even if it is pretty big, it's only needed during the mixer function, so it can go on the stack. Of course, if that's the only thing in your game that uses a lot of stack space, then it's just as bad as giving it a permanent chunk. But if a few other speed-critical sections can get some use out of a big stack, then there's no guilt in using it to speed up the mixer.
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

#157787 - eKid - Thu May 29, 2008 5:36 am

Code:

@ do left channel
        add        r12, r5, r6, lsr #0x0C           @ r12 = position in ROM
        ldrsb      r12, [r12]                       @ r12 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r14, r5, r6, lsr #0x0C           @ r14 = position in ROM
        ldrsb      r14, [r14]                       @ r14 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r12, r12, r14, lsl #0x10         @ r12 = 2 samples


You can add a shifted register to the LDRSB address, instead of using an add before the load.

Code:

@ do left channel
        ldrsb      r12, [r5, r6, lsr#0x0C]          @ r12 = sample
        add        r6, r6, r7                       @ pos += inc
        ldrsb      r14, [r5, r6, lsr#0x0C]     @ r14 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r12, r12, r14, lsl #0x10         @ r12 = 2 samples

#157788 - Ruben - Thu May 29, 2008 5:42 am

eKid wrote:
Code:

@ do left channel
        add        r12, r5, r6, lsr #0x0C           @ r12 = position in ROM
        ldrsb      r12, [r12]                       @ r12 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r14, r5, r6, lsr #0x0C           @ r14 = position in ROM
        ldrsb      r14, [r14]                       @ r14 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r12, r12, r14, lsl #0x10         @ r12 = 2 samples


You can add a shifted register to the LDRSB address, instead of using an add before the load.

Code:

@ do left channel
        ldrsb      r12, [r5, r6, lsr#0x0C]          @ r12 = sample
        add        r6, r6, r7                       @ pos += inc
        ldrsb      r14, [r5, r6, lsr#0x0C]     @ r14 = sample
        add        r6, r6, r7                       @ pos += inc
        add        r12, r12, r14, lsl #0x10         @ r12 = 2 samples
Nope, already tried that many times before. 'ldrb r0, [r1, r2, lsr #0x02]' is OK but 'ldrsb' doesn't work for it.

#157789 - eKid - Thu May 29, 2008 5:45 am

oh.. oops :P
My mixer takes unsigned samples... so I can use that trick. (aren't the samples in S3M unsigned?)

#157790 - Ruben - Thu May 29, 2008 5:49 am

eKid wrote:
oh.. oops :P
My mixer takes unsigned samples... so I can use that trick. (aren't the samples in S3M unsigned?)
Yeah, they are but I XORed them with 128 'cause I couldn't get unsigned samples working right >.<" And besides, most games use signed samples so I thought I'd join in :D Lol.

#157791 - Ruben - Thu May 29, 2008 6:01 am

Quote:
That said, I'd recommend packing the left and right volumes together, rather than 2 samples. Saves at least the one add per 2 samples, and probably will save some registers too. But in that case, you would want to multiply with the volume register first, followed by the one byte sample.
So, basically, load a sample into (i.e.) r0, multiply that sample by left vol into r1, multiply the sample by right vol into r2, add them together with a 'lsl #0x10' and store? If so, how? It confused me the second I tried it 'cause I'm using all the registers.

Quote:

Since you're not using a whole lot of registers even as it is, how about unrolling that loop 4 or 8 times and ldm/stm'ing the temp buffer values?
Umm... I suppose you mean that I don't use too many registers within the sample loop which means that I would need to stack one or two, right? 'Cause I'm using r0-r14 (except r13) atm...

Quote:

Also, make sure the temp buffer is in IWRAM. Even if it is pretty big, it's only needed during the mixer function, so it can go on the stack. Of course, if that's the only thing in your game that uses a lot of stack space, then it's just as bad as giving it a permanent chunk. But if a few other speed-critical sections can get some use out of a big stack, then there's no guilt in using it to speed up the mixer.
... Huh??????

#157792 - DekuTree64 - Thu May 29, 2008 6:26 am

Ruben wrote:
So, basically, load a sample into (i.e.) r0, multiply that sample by left vol into r1, multiply the sample by right vol into r2, add them together with a 'lsl #0x10' and store?

Not quite. Before going into the sample loop, load the left and right volumes, and add them together with lsl #0x10. Then inside the sample loop, a single mla multiplies one sample with both volumes, and adds to the result. Very similar to what you're doing, just multiplying 1 sample by 2 volumes, rather than 2 samples by 1 volume. You will end up with interleaved left/right samples, but that can be sorted out in the downsampling step.

Ruben wrote:
Umm... I suppose you mean that I don't use too many registers within the sample loop which means that I would need to stack one or two, right?

Yeah. The sample loop gets a whole lot of repetitions, so if stacking a few registers can save a few cycles per sample, do it.

Ruben wrote:
DekuTree64 wrote:
blah blah blah ... temp buffer ... IWRAM ... blah
... Huh??????

I'm talking about your global "S3MTmp", which seems to be the buffer that you're storing the intermediate mixed samples in before downsampling at the end. Since you have to load and store to it for every sample for every channel, putting it in fast memory saves a whole lot of cycles.

...but I just noticed that it's declared right there at the top, in IWRAM, so you can just leave it as is.
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

#157793 - Ruben - Thu May 29, 2008 7:00 am

Quote:
Not quite. Before going into the sample loop, load the left and right volumes, and add them together with lsl #0x10. Then inside the sample loop, a single mla multiplies one sample with both...

Tried that and it actually slowed down the code... :S Maybe because it takes double the iterations?

Quote:
Yeah. The sample loop gets a whole lot of repetitions, so if stacking a few registers can save a few cycles per sample, do it.

Theoretically, I'm supposed to be saving around 1 or 2 cycles per sample but the CPU usage (according to no$gba) is still the same as when I used str twice. Right now I'm using double samples rather than left and right at the same time and I'm stacking r0 and r1 and using them as temps (r0: sample, r1: left sample, r12: right sample) then stmia'ing them to the temp buffer but it has the same speed. Should I still stack or should I revert back to what it was?

#157794 - kusma - Thu May 29, 2008 10:19 am

Oh, there's so many things! Unsigned mixing is definately faster, but it's a bit more complex since you need to track the DC offsets per frame rendered. Also, in my modplayer, I moved the looping-logic outside of the raw mixer. You can have a look at how that's done here. So, once you've done both of those optimizations, unrolling is your next friend, since the loads and stores from your temp-buffer become much faster, and you don't get that expensive branch for every sample.

#157795 - Cearn - Thu May 29, 2008 11:15 am

Just a few small things:

Code:
        mov        r10, r3, asr #0x02
        and        r10, r10, #0x7F
        mul        r10, r12, r10                @ r10 = channel vol (left) * master vol
        mov        r10, r10, asr #0x04          @ r10 = final volume
       
        mov        r11, r3, asr #0x09
        and        r11, r11, #0x7F
        mul        r11, r12, r11                @ r11 = channel vol (right) * master vol
        mov        r11, r11, asr #0x04          @ r11 = final volume

Preload the 0x7F and use a shifted AND:
Code:
        mov        r14, 0x7F

        and        r10, r14, r3, asr #2
        mul        r10, r12, r10                @ r10 = channel vol (left) * master vol
        mov        r10, r10, asr #0x04          @ r10 = final volume
       
        and        r11, r14, r3, asr #9
        mul        r11, r12, r11                @ r11 = channel vol (right) * master vol
        mov        r11, r11, asr #0x04          @ r11 = final volume


Code:
        ldr        r14, [r9]                    @ r14 = mixed data
        mla        r14, r10, r12, r14           @ r14 = final data
        str        r14, [r9], #0x04             @ store mixed data and increment

@ do right channel
        ldr        r14, [r9]                    @ r14 = mixed data
        mla        r14, r11, r12, r14           @ r11 = final data
        str        r14, [r9], #0x04             @ store mixed data and increment

You can combine the left/right loads and stores. Assuming r10 is free:
Code:
        ldmia      r9, {r10, r14}                    @ Load left/right data
        mla        r10, r10, r12, r10               @ r14 = final data
        mla        r14, r11, r12, r14               @ r11 = final data
        stmia      r9!, {r10, 14}                   @ store mixed data and increment

About signed vs unsigned: the problem is probably the center-point. Signed data centers around 0, but in unsigned samples the zero-line is at 128. When you then add samples, the center-line shifts as well. For example, an empty unsigned sample will contain 128s. Adding two of them gives 256, which definitely ([/spelling police]) does not count as empty.

The true range is always [-M/2, +M/2), i.e. a signed sample (s). For unsigned samples (u), the bar is raised by M/2, so you have s = u-M/2. This should still be true after the summation.
Code:
Signed :  S = ∑(0 to n) s[i]
vs
Unsigned: U - M/2 =  ∑(0 to n) (u[i]-M/2)
                U = ( ∑ u[i] ) - (n-1)*M/2

In other words, you have to subtract (n-1)*M/2 from the final sum over all unsigned samples.

Of course, since I hardly know anything about sound programming, I could be entirely wrong here :P

EDIT : forgot about volume.
Things are slightly different when the samples have different volumes. This scales the center-point as well:
Code:
U - M/2 = ∑(0 to n) V[i]*(u[i]-M/2)
      U = ∑ V[i]*u[i] - (∑V[i])*M/2 + M/2


But yeah, unsigned should make things faster. The adds for the ldrsb can be combined into a single ldrb. It's also useful in the downsample loop, since clamping between 0 and a power of two can be done in two instructions rather than four.

Also:
Ruben wrote:
Code:
 u8        cr; //current row (+23)
 u8        co; //current order (+24)
 u8        oc; //order count (+25)
 
 tChn  pc[32]; //player channels (+26)

 u8        pd; //pattern delays left
 u32       ef;     //effect flags

Watch your member packing (and I'm talking about struct members here >_>). tChn requires 32bit alignment, so it's actually at 0x28. There is a 2-byte gap between oc and pc, as well as a 3-byte gap between pd and ef. Consider moving pd into the gap between oc and pc for a slightly smaller struct.

#157796 - Ruben - Thu May 29, 2008 12:47 pm

Wow! That was so much better than what I had expected! Thanks a MILLION guys!

What I ended up doing was following... uh... kusma's (?) advice on calling different functions for different things along with Cearn's ldmia/stmia suggestion. I'm not going to move onto unsigned samples 'cause I never understood how to properly mix them together so I may as well stick to signed mode.

Now to figure out why the right speaker clicks... ROFL!

#159170 - Ruben - Thu Jun 26, 2008 1:52 pm

OK... umm... I went all out and thought of a faster method... this one:

Code:
@ Mixer constants.
.equ SND_BUFFERLEN, 304                 @ length used to size SndTmp (presumably samples per buffer)
.equ SND_ACCBITS, 12                    @ shift applied to positions/increments (fraction bits)

@ SndTmp: temporary mix buffer in IWRAM.
@ Size: SND_BUFFERLEN*2*16/8 = 1216 bytes -- presumably 2 x 16-bit values per sample.
.section .iwram, "ax", %progbits
.align
SndTmp:
 .space SND_BUFFERLEN*2*16/8
.size SndTmp, .-SndTmp

.section .iwram, "ax", %progbits
.global SndMix
.extern sa__
.align
.arm

@ r0: samples to mix
SndMix:

r14 is a pointer to the channels, r12 is the number of channels left (16 now), and r0 is the samples to mix
...

.L1:
   ldrb      r11, [r14]         @ \
   tst      r11, #0x01         @  make sure channel is active
   beq      .L5            @ /
   
   stmfd      sp!, {r0,r12,r14}
   
   ldrb      r1, [r14, #0x02]      @ \
   ldrb      r2, [r14, #0x03]      @  r1: l/r volumes
   add      r1, r1, r2, lsl #0x10      @ /
   ldmib      r14, {r2-r4}         @ r2 = sample pointer
                     @ r3 = position
                     @ r4 = increment
   add      r2, r2, #0x10
   ldr      r5, [r2, #-0x04]      @ r5 = length
   
   ldr      r6, =SndTmp         @ r6 = pointer to temp buffer
   add      r7, r2, r3, lsr #SND_ACCBITS
   add      r14, r4, lsr #SND_ACCBITS
   
   mla      r8, r4, r0, r3
   cmp      r5, r8, lsr #(SND_ACCBITS-2)
   blt      .L2

.Lfast:
   ldmia       r6, {r8-r11}         @ \
   ldrsb      r12, [r7], r14         @  |
   mla      r8, r12, r1, r8         @  |
   ldrsb      r12, [r7], r14         @  |
   mla      r9, r12, r1, r9         @  do 4 samples
   ldrsb      r12, [r7], r14         @  |
   mla      r10, r12, r1, r10      @  |
   ldrsb      r12, [r7], r14         @  |
   mla      r11, r12, r1, r11      @ /
   stmia       r6!, {r8-r11}         @ store mixed samples
   subs      r0, r0, #0x04
   bhi      .Lfast
   
   mov      r7, r0, lsl #0x02      @ \
   mla      r3, r4, r7, r3         @  save new position
   b      .L4

.L2:
   ldr       r8, [r6]         @ get mixed samples
   ldrsb      r12, [r7], r14         @  get the sample
   mla      r8, r12, r1, r8         @ get final value
   str       r8, [r6], #0x04         @ store mixed samples
   
   add      r3, r3, r4
   cmp      r5, r3, lsr #SND_ACCBITS
   ble      .L6

.L3:
   subs      r0, r0, #0x01
   bhi      .L2

.L4:
   ldmfd      sp!, {r0,r12,r14}
   str      r3, [r14, #0x08]      @ store position

.L5:
   add      r14, r14, #0x20         @ go to the next channel
   subs      r12, r12, #0x01         @ channels left--
   bne      .L1
   b      .L7            @ if == 0, skip the EOF handler

.L6:
   ldrh      r8, [r2, #-0x0E]      @ get sample stat
   tst      r8, #0x4000         @ check if it's looping
   ldrne      r3, [r2, #-0x08]      @  get loop point
   movne      r3, r3, lsl #SND_ACCBITS   @  put it in FP format
   bne      .L3            @  keep looping
   
   ldr      r7, [sp, #-0x04]
   bic      r8, r8, #0x01         @ otherwise stop channel
   strh      r8, [r7]         @ store channel control
   b      .L4            @  stop looping
...


I thought that by incrementing by a set value using "ldrsb r0, [r1], r2", it would speed it up but it seems to be reading... ... ... garbage to say the least. Any ideas why?

I also changed the channel structure around like this:
Code:
/* Per-channel state for the reworked mixer. The (+NN) notes are byte offsets;
 * the last field ends at +1F, matching the 0x20-byte step the SndMix listing
 * uses to advance to the next channel. */
typedef struct channel {
 u8      sf;   //status flags (+00)
 s8      pa;   //panning (+01)
 u8      lv;   //left  vol (0->64) (+02)
 u8      rv;   //right vol (0->64) (+03)
 voice*      wp;   //voice pointer (+04)
 u32      po;   //position (+08)
 u32      in;   //increment (+0C)
 u8      wl;   //waits left (+10)
 u8      pr;   //program (+11)
 u8      vo;   //volume (+12)
 u8      co;   //chn. origin (+13)
 u8*      nb;   //next byte (+14)
 u32      ja;   //jump address (+18)
 u8      at;   //attack envelope (+1C)
 u8      de;   //decay envelope (+1D)
 u8      su;   //sustain envelope (+1E)
 u8      re;   //release envelope (+1F)
} channel;

#159171 - eKid - Thu Jun 26, 2008 2:07 pm

Code:

.Lfast:
   ldmia       r6, {r8-r11}         @ \
   ldrsb      r12, [r7], r14         @  |
   mla      r8, r12, r1, r8         @  |
   ldrsb      r12, [r7], r14         @  |
   mla      r9, r12, r1, r9         @  do 4 samples
   ldrsb      r12, [r7], r14         @  |
   mla      r10, r12, r1, r10      @  |
   ldrsb      r12, [r7], r14         @  |
   mla      r11, r12, r1, r11      @ /
   stmia       r6!, {r8-r11}         @ store mixed samples
   subs      r0, r0, #0x04
   bhi      .Lfast


I can't really understand how this works? Isn't r7 a fixed point number?

#159172 - Ruben - Thu Jun 26, 2008 2:11 pm

eKid wrote:

...
I can't really understand how this works? Isn't r7 a fixed point number?


r7 = sound wave position + (position >> accuracy)
r14 = increment >> accuracy

:P

#159173 - Ruben - Thu Jun 26, 2008 2:19 pm

Oh, I just found a small bug... "add r14, r4, lsr #SND_ACCBITS" should be "mov r14, r4, lsr #SND_ACCBITS". But it's still buggy...

#159174 - eKid - Thu Jun 26, 2008 2:23 pm

Wouldn't shifting out all the accuracy bits cause a bunch of noise/failure? :)

#159176 - Ruben - Thu Jun 26, 2008 2:32 pm

eKid wrote:
Wouldn't shifting out all the accuracy bits cause a bunch of noise/failure? :)

Noise? Failure? o.O *confused*

#159179 - Cearn - Thu Jun 26, 2008 3:51 pm

What eKid means is that if you shift down before adding, the additions won't be in fixed point anymore and you stand to lose a lot of precision that way.

It's similar to this situation:
Code:
float a= 0.5;
int b= int(a+a+a+a);   // b = 0.5+0.5+0.5+0.5 = 2

int a= 0.5;            // cast to int removes sub-integer accuracy
int b = a+a+a+a;       // b = 0+0+0+0 = 0


Also,
Code:
    ldrb    r11, [r14]
    ldrb    r1, [r14, #0x02]
    ldrb    r2, [r14, #0x03]
    add     r1, r1, r2, lsl #0x10

works a little faster like this: (10 cycles vs 6)
Code:
    ldr     r11, [r14]              // R L x x
    mov     r1, r11, lsr #16        // 0 0 R L
    orr     r1, r1, r1, lsl #8      // 0 R x L
    bic     r1, r1, #0xFF00         // 0 R 0 L

And if you want to be really 1337, you can simply start with the optimal arrangement in the struct. That said, I doubt this is really the place where you can gain that much speed. You'll probably gain more from switching to unsigned data.

#159213 - Ruben - Fri Jun 27, 2008 5:00 am

Nice... that is some serious thinking you can do! :P Anyway... is there anyway to contact you outside of the forum for code optimization? (I don't want to disclose the source just yet! :P) I'm pretty sure there's more room for optimization in the actual "sound.c" file so yeah.