gbadev.org forum archive

Hi everyone. I was reading up on Deku's sound mixing thingy, optimized it, blah blah blah... but then I though, "I think I should do it in assembler." I came up with this code but I know for a FACT that there's something wrong in it... I just don't know WHAT LMAO!

Code:

.section .iwram, "ax", %progbits
.align 4
.arm
.global SndMix, tmpBuffer
.extern sndVars, sngVars, sndChannels

@ r0: samples to mix
SndMix:
stmfd sp!, {r1-r12} @ push {r1-r12}

@ Clear the temp buffer
add r0, r0, #0x01 @ r0 += 0x01 (to avoid LSR'ing 1)
mov r1, #0x04000000 @ r1 = 0x04000000 (REG_BASE)
add r1, r1, #0xD4 @ r1 += 0xD4 (DMA3SAD)
mov r2, #0x00000000 @ r2 = 0x00000000 (NULL)
str r2, [r1, #0x08] @ *(r1 + 0x08) = r2 (DMACNT)
ldr r2, =ClrVal @ r2 = &ClrVal
ldr r3, =tmpBuffer @ r3 = &sndTmpBuffer
mov r4, #0x85000000 @ r4 = 0x85000000 (MEMSET32)
add r4, r4, r0, lsr #0x01 @ r4 += r0 >> 1
stmia r1!, {r2-r4} @ *r4++ = r1, *r4++ = r2, *r4++ = r3

@ Get the global volume
ldr r1, =sngVars @ r1 = &sngVars
ldr r1, [r1] @ r1 = *((int*)r1) (gVol)

@ Loop through the channels and mix data
mov r2, #0x09 @ r2 = 0x09
ldr r3, =sndChannels+0xE0 @ r3 = &(*sndChannels[8])

.LChnLoop:
ldr r4, [r3] @ r4 = *((int*)r3) (CHN_CNT)
ands r5, r4, #0x01 @ \
beq .LChnNA @ if(!(r4 & 0x01)) goto .LChnLoopEnd

.LChnActive:
ldr r5, =tmpBuffer @ r5 = &tmpBuffer
mov r6, r4, lsr #0x03 @ r6 = r4 >> 2 (CHN_VOL)
muls r7, r6, r1 @ r7 = r6 * r1 |
mov r6, r7, lsr #0x06 @ r6 = r7 >> 6 (CHN_VOL)
ldr r7, [r3, #0x04]! @ r7 = *(int*)(r3 += 4) (CHN_POS)
ldr r8, [r3, #0x04]! @ r8 = *(int*)(r3 += 4) (CHN_INC)
ldr r9, [r3, #0x04]! @ r9 = *(int*)(r3 += 4) (CHN_LEN)
ldr r10, [r3, #0x08]! @ r10 = *(int*)(r3 += 8) (CHN_SRC)
add r3, r3, #0x04 @ r3 += 0x04 (CHN_FRQ) (end)
sub r3, r3, #0x1C
stmfd sp!, {r0}

@ Mix down
.LChnActMix:
ldrb r11, [r10, r7, lsr #0x0C] @ r12 = *(char*)(r10+r11)
mul r11, r12, r6 @ r11 = r12 * r6 (CHN_VOL)
ldrsh r12, [r5]
add r11, r11, r12
strh r11, [r5]
add r5, r5, #0x02
add r7, r7, r8
cmp r7, r9
bge .LChnEnd

.LChnLink:
subs r0, r0, #0x01
bne .LChnActMix

.LChnLinkEnd:
ldmfd sp!, {r0}
str r7, [r3, #0x04]
b .LChnLoopEnd @ goto .LChnLoopEnd

.LChnEnd:
ands r11, r4, #0x02
beq .LChnSmpEnd

.LChnSmpEndLoop:
ldr r11, [r3, #0x14]
mov r7, r11
b .LChnLink

.LChnSmpEnd:
mov r7, #0x00
sub r11, r4, #0x02
str r11, [r3]
b .LChnLinkEnd

.LChnNA:
.LChnLoopEnd:
sub r3, r3, #0x1C
subs r2, r2, #0x01 @ \
bne .LChnLoop @ if(r2-- != 0) goto .LChnLoop

.LDownsample:
ldr r1, =tmpBuffer-2
ldr r2, =sndVars
ldr r2, [r2] @ curmixbuffer
sub r2, r2, #0x01

.LDownsampleLoop:
ldrh r3, [r1, #0x02]!
mov r4, r3, lsr #0x08
strb r4, [r2, #0x01]!
subs r0, r0, #0x01
bne .LDownsampleLoop

.LEnd:
ldmfd sp!, {r1-r12} @ pop {r1-r12}
bx lr @ bx to lr

.align 4

ClrVal:
.word 0x00000000

tmpBuffer:
.space 736*2

.end

The externs sndVars, sngVars and sndChannels are as follow:

Code:

typedef struct __attribute__ ((aligned(4))) {
s8 *mixBufferBase;
s8 *curMixBuffer;
u8 activeBuffer;
u32 smpsTilTick;
u32 smpsPerTick;
u32 mixFreq;
u32 rcpMixFreq;
u32 mixBufferSize;
} SND_VARS;

typedef struct __attribute__ ((aligned(4))) {
u32 gVol;
u32 mode;
u32 state;
u8 tickA;
u8 tickB;
u8 row;
... ... ...
} MOD_VARS;

typedef struct __attribute__ ((aligned(4))) {
u32 cnt;
u32 pos;
u32 inc;
u32 len;
u32 loopStart;
s8 *dat;
u32 freq;
} SND_CHANNEL;

...

SND_VARS sndVars;
MOD_VARS sngVars;
SND_CHANNEL sndChannels[9];

I know, I know... lots of room for optimization but I just wanna get this working first. Thanks a lot guys.

Oh, and BTW: I know... lots of old comments and typos lol...

Ruben wrote:

Code:

@ Mix down
.LChnActMix:
ldrb r11, [r10, r7, lsr #0x0C] @ r12 = *(char*)(r10+r11)
mul r11, r12, r6 @ r11 = r12 * r6 (CHN_VOL)

The comments do not match the code ... did you mean "ldrb r12, ..." ?

Some other points (for when it's actually working):

You don't need to save r0-r3 and r12 on the stack; the caller expects these to be clobbered.
IIRC, CpuFastSet (swi 0xC0000) is actually about 10% faster than DMA for filling large amounts of data (provided it's a multiple of 32 bytes long). if the situation allows for it, try using that some time as well. The swi's do have a substantial overhead though.
In .LChnActive, you're using stuff like "ldr r7, [r3, #0x04]!". You don't have to use write-back all the time. It's probably more readable if you didn't. Example:

Code:

ldr r7, [r3, #0x04] @ chn->pos
ldr r8, [r3, #0x08] @ chn->inc
ldr r9, [r3, #0x0C] @ chn->len
ldr r10, [r3, #0x14] @ chn->dat

This would also mean that the addition and subtraction you have now aren't necessary. Of course, if you want to be 1337 (and fast), you could use something like "ldmib r3, {r7-r9}", but you'd have to be careful where your pointer ends up. (Note: untested, but I think it should work.)

On

Code:

.LChnEnd:
ands r11, r4, #0x02
beq .LChnSmpEnd

.LChnSmpEndLoop:
ldr r11, [r3, #0x14]
mov r7, r11
b .LChnLink
.LChnSmpEnd:

Conditional opcodes are teh awesome:

Code:

.LChnEnd:
tst r4, #0x02 @ like ands, but sans extra register
ldrne r7, [r3, #0x14] @ .LChnSmpEndLoop:
bne .LChnLink
.LChnSmpEnd:

In

Code:

ldrsh r12, [r5]
add r11, r11, r12
strh r11, [r5]
add r5, r5, #0x02

The store and addition can be combined. "strh r11, [r5], #2" would increment r5 by 2 after the store. Something similar is also useful in the down-sample part, which, if I read it correctly, currently decrements the pointers before the loop so that the "Rd, [Rn, Op2]!" parts work properly inside it.
Have you checked what the compiler makes of it, by the way? The code doesn't seems too complicated, and I'm not sure the compiler would come up with something that much slower. That said, it's still a nice exercise :)

Hehehe... yeah, I did mean r12. Like I said: those were older comments. But I got it sorted now, it works fine (though it has a weird click...). So now I'm gonna try your suggestions. Thanks a lot!

BTW: Yeah, I did check the code the compiler came up with. About 2-3 scanlines slower, which in a lot of channels adds up.

Always use ldm/stm ops when possible. They will save you 1-2 cycles per register used.

On the Swi calls - pretty much don't do them unless you are copying huge amounts of info. There is somewhere in the neighborhood on 80 cycles of overhead getting in and out of the system crap. Pretty much, you are better off writing your own fast copy in IWRAM and calling that.

In general when optimizing ARM code, if you see alot of mov and cmp/tst type instructions, there are ways to speed things up.

The stack is your enemy. Don't use the stack if you can at all help it.

Now to the code itself:
In LChnLoop:
instead of anding, you can use tst and preserve r5.

In Active:
Your comment says >> 2 but the code is lsr 3.
You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting.
The 4 ldr instuctions feel like they could easily become an ldm instruction of some sort.
Your last 2 instructions cancel each other. Add 4 then subtract 28 (0x1c) from the same register is the same as subtract 24 (0x18)
Are you sure you have to push/pop r0? That's wasteful
at LChnEnd: Your ands could be replace with tst to save a register.
The 3 instructions after your and at LChnEnd should probably be conditionally executed
Is writeback mode even allowed when accessing halfwords? I didn't think so, but if it compiles, fine.
In your downsample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent.
Don't save r1,r2,r3 or r12. Do use lr (r14) if you need an extra temp register.

"Your comment says >> 2 but the code is lsr 3."

Like I said, older comments :)

"You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting."

What exactly do you mean? That kinda confused me.

"The 4 ldr instuctions feel like they could easily become an ldm instruction of some sort."

Done! :)

"Your last 2 instructions cancel each other. Add 4 then subtract 28 (0x1c) from the same register is the same as subtract 24 (0x18)"

Like I said, I wanted to get this working before optimizing :)

"Are you sure you have to push/pop r0? That's wasteful"

Unless there's some other way to save r0 somewhere else...

"at LChnEnd: Your ands could be replace with tst to save a register."
"The 3 instructions after your and at LChnEnd should probably be conditionally executed"

Done. :)

"In your downsample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent."

Explain! It's confusing me LOL!

As far as I can tell, this is the routine in C:

Code:

void SndMix(u32 nSamples)
{
// Clear tmp buffer
volatile u32 fill= 0;
REG_DMA3CNT= 0;
REG_DMA3SRC= &fill;
REG_DMA3DST= &tmpBuffer;
REG_DMA3CNT= DMA_ENABLE | DNA_32 | DMA_SRC_FIXED;

// Main variables
u32 mainVol= sngVars.gVolume;
SND_CHANNEL *chn= &sndChannels[8];
u16 *buffy;

// Channel loop
for(ii=0; ii<9; ii++)
{
   // (.LChnLoop)
   u32 cnt= chn->cnt;
   if(cnt&1)    // Only use active channels
   {
      // .LChnActive
      buffy= tmpBuffer;
      u32 vol= mainVol*(cnt>>3)>>6; // p0 ?!? volume, orly ?
      u32 pos= chn->pos;          // Q12 number
      u32 inc= chn->inc;
      u32 len= chn->len;
      u8 *src= chn->dat;

      // Sample loop (.LChnActMix)
      for(jj=0; jj<nSamples; jj++)
      {
         buffy[jj] += src[pos>>12] * vol;

         pos += inc;
         if(pos > len)    // (.LChnEnd)
         {
            // Check for looping sound
            if(cnt & 2)    // (.LChnSmpEndLoop)
            {
               pos= chn->dat;    // p1 ?!?
            }
            else
            {
               chn->cnt -= 2;    // p2 ?!?
               pos= 0;
               break;
            }
         }
      }
      chn->pos= pos;
   }
   chn--;
}

// Down-sample
buffy= tmpBuffer;
u8 *dst= sndVars.mixBufferBase;
for(jj=0; jj<nSamples; jj++)
   *dst++ = *buffy++;
}

If so, I have 5 questions:

What is the usual size of nSamples ? Order of magnitude figure will do.

What happens if the sum of the channels exceeds 16 bits?

(p0) : How exactly is the channel volume hidden inside chn->cnt ? Is it really bits 3 to 31 ?

(p1) : for looping channels, the asm sets the position to chn->dat. Shouldn't this be chn->loopStart ?

(p2) : why subtract the control by 2. Isn't the 'on' bit in the first bit, not the second?

Miked0801 wrote:

Is writeback mode even allowed when accessing halfwords? I didn't think so, but if it compiles, fine.

You're probably thinking about a shifted Op2. Those don't work for halfwords, but writeback is fine.

Ruben wrote:

Miked0801 wrote:

You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting.

What exactly do you mean? That kinda confused me.

"a * (b>>3) >> 6" should be pretty close to "(a*b)>>3>>6 = (a*b)>>9". The difference is a*(b%8)>>6. If a*7 is smaller than 64 (and if a*b fits into 32 bits), you can use the >>9 form without any consequence. If a*7 > 64, there would be a very small but possibly inaudible difference.

Ruben wrote:

Miked0801 wrote:

Are you sure you have to push/pop r0? That's wasteful

Unless there's some other way to save r0 somewhere else...

You still have the link register (lr / r14) left over that you can use don't forget to stack it, of course. I think it's possible to reduce the register usages by one or two by the way.

Ruben wrote:

Miked0801 wrote:

In your down-sample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent.

Explain! It's confusing me LOL!

Right now you're converting one halfword into one byte per loop. If the source pointer is 32-bit aligned (which it is) and the sndVars buffer is halfword aligned, you can cover two samples in one loop like this:

Code:

@ Assuming :
@ r0 : u16 *dst= tmpBuffer;
@ r1 : u32 *src= sdnVars.mixBufferBase;
@ r2 : nSamples
@ r3 : data
@ r4 : mask (0x00FF00FF)

mov    r4, 0xFF
mov    r4, r4, lsl #16
.LDownSampleLoop:
ldr    r3, [r1], #4       @ p = 0xDDccBBaa
and    r3, r4, r3, lsr #8    @ p = m & p>>8 = 0x00DD00BB
orr    r3, r3, r3, lsr #8    @ p = p | p>>8 = 0x00DDDDBB
strh r3, [r0], #2       @ store 0xDDBB
subs r2, r2, #2
bne    .LDownSampleLoop

This can also be extended to a 4 halfword -> 4 byte loop with ease. And then unrolled to get rid of some loop overhead.

(p0) : How exactly is the channel volume hidden inside chn->cnt ? Is it really bits 3 to 31 ?

Yeahuh. I don't really like using halfwords for stuff (in case you didn't already notice lol).

(p1) : for looping channels, the asm sets the position to chn->dat. Shouldn't this be chn->loopStart ?

... ... ... it does? *double checks and fixes it up*

(p2) : why subtract the control by 2. Isn't the 'on' bit in the first bit, not the second?

Hehe. Yeah, it's fixed up in my new code

"(a*b)>>3>>6 = (a*b)>>9"

Is that basically (assuming r4 is the destination, r5: a, r6: b)

Code:

mul r7, r5, r6
mov r4, r7, lsr #0x09

"You still have the link register (lr / r14) left over"

Wouldn't that mean I still have to do some stacking?

"If the source pointer is 32-bit aligned (which it is) and the sndVars buffer is halfword aligned, you can cover two samples in one loop"

Since I'm not really sure what you mean, I'm assuming you mean something like align 4... IDK... lol...

Oh, and in regards to that code:
1) Why is the tmpBuffer the destination?
2) Why is the 'source' 'mixBufferBase' and not 'curMixBuffer'? IDK if those weren't typos but when I tried that code, NO$GBA just crashed...

OK. After trying out that code (and changing it around to make it work! lol), I figured it didn't really speed up much so I'm just gonna leave it. This is my newest code. Haven't double checked the comments so ignore them... Any room for optimization?

Code:

SndMix:
stmfd sp!, {r4-r11,lr} @ push {r4-r11,lr}
add r5, r0, #0x01 @ r5 = r0 + 0x01
ldr r1, =0x040000D4 @ r1 = 0x040000D4 (DMA3SAD)
mov r2, #0x00000000 @ r2 = 0x00000000 (NULL)
str r2, [r1, #0x08] @ *(r1 + 0x08) = r2 (DMACNT)
ldr r2, =ClrVal @ r2 = &ClrVal
ldr r3, =tmpBuffer @ r3 = &sndTmpBuffer
mov r4, #0x85000000 @ r4 = 0x85000000 (MEMSET32)
add r4, r4, r5, lsr #0x01 @ r4 += r5 >> 0x01
stmia r1, {r2-r4} @ SAD = r2, DAD = r3, CNT = r4
ldr r1, =sngVars @ r1 = &sngVars
ldr r1, [r1] @ r1 = sngVars.gVol
mov r2, #0x09 @ r2 = 0x09
ldr r3, =sndChannels+0xE0 @ r3 = &sndChannels[8]

.LChnLoop:
ldr r4, [r3] @ r4 = chn->cnt
tst r4, #0x01 @ \
beq .LChnNA @ if(!(r4 & 0x01)) goto .LChnLoopEnd

.LChnActive:
ldr r5, =tmpBuffer @ r5 = &tmpBuffer
mul r7, r4, r1 @ r7 = cnt * gVol
mov r6, r7, lsr #0x09 @ r6 = r7 >> 9 (VOLUME)
ldmib r3, {r7-r10} @ r7 = chn->pos, r8 = chn->inc,
@ r9 = chn->len, r10 = chn->data
mov lr, r0 @ lr = samplesToMix

.LChnActMix:
mov r11, r7, lsr #0x0C @ r11 = r7 >> 0x0C
ldrsb r12, [r10, r11] @ r12 = *(char*)(r10+r11)
mul r11, r12, r6 @ r11 = r12 * r6 (VOLUME)
ldrsh r12, [r5] @ r12 = *(short*)(r5)
add r11, r11, r12 @ r11 += r12
strh r11, [r5], #0x02 @ r11 = r5; r5 += 2
add r7, r7, r8 @ r7 += r8 (chn->inc)
cmp r7, r9 @ \
bge .LChnEnd @ if(chn->pos >= chn->len) goto .LChnEnd

.LChnLink:
subs lr, lr, #0x01 @ \
bne .LChnActMix @ if(--r0) goto .LChnActMix

.LChnLinkEnd:
str r7, [r3, #0x04] @ (r3 + 0x04) = r7 (chn->pos)
b .LChnLoopEnd @ goto .LChnLoopEnd

.LChnEnd:
tst r4, #0x02 @\
ldrne r7, [r3, #0x14] @ if(chn->cnt & LOOP)
bne .LChnLink @ r7 = loopStart; goto .LChnLink

.LChnSmpEnd:
mov r7, #0x00 @ r7 = 0x00 (chn->pos)
sub r11, r4, #0x01 @ r4 = r11 - 0x01 (CHN_ACT)
str r11, [r3] @ r3 = r11
b .LChnLinkEnd @ goto .ChnLinkEnd

.LChnNA:
.LChnLoopEnd:
sub r3, r3, #0x1C @ r3 -= sizeof(channel)
subs r2, r2, #0x01 @ \
bne .LChnLoop @ if(--r2) goto .LChnLoop

.LDownsample:
ldr r1, =sndVars @ r1 = &sndVars
ldr r2, [r1, #0x04] @ r2 = sndVars->curMixBuffer
ldr r3, =tmpBuffer @ r3 = &tmpBuffer
mov r4, r2 @ r4 = sndVars->curMixBuffer
add r4, r4, r0 @ r4 += samplesToMix
str r4, [r1, #0x04] @ curMixBuffer = r4

.LDownsampleLoop:
ldrh r4, [r3], #0x02 @ r4 = *(short*)(r3); r3 += 0x02
mov r4, r4, lsr #0x08 @ r4 >>= 0x08
strb r4, [r2], #0x01 @ r2 = r4; r2++
subs r0, r0, #0x01 @ \
bne .LDownsampleLoop @ if(--r0) .LDownsampleLoop

.LEnd:
ldmfd sp!, {r4-r11,lr} @ pop {r4-r11,lr}
bx lr @ bx to lr

.align 4

ClrVal:
.word 0x00000000

tmpBuffer:
.space 736*2

And I also changed this around:

Code:

typedef struct __attribute__ ((aligned(4))) {
u32 cnt;

//bit 1 = on, bit 2 = looped, bit 3-32 = vol

u32 pos;
u32 inc;
u32 len;
s8 *dat;
u32 loopStart;
u32 freq;
} SND_CHANNEL;

This is what I came up with ...

Code:

SndMix:
stmfd sp!, {r4-r11, r14}

@ Clear buffer
mov r2, #0x04000000
add r1, r2, #0xD4
str r2, [r1, #8] @ Clear REG_DMA3CNT
ldr r2,=ClrVal
ldr r3,=tmpBuffer
add r4, r0, #1
add r4, r4, r4, lsr #1
add r4, r4, 0x85000000
stmfd r1!, {r2-r4}

@ Set-up main loop:
ldr r1,=sngVars
ldr r1, [r1] @ master volume
ldr r6,=sndChannels @ chn
mov r2, #9 @ ii

@ Register list:
@ r0 : nSamples
@ r1 : mainVol
@ r2 : ii
@ r3 : tmpBuffer
@ r4 : dst
@ r5 : vol
@ r6 : chn
@ r7 : cnt
@ r8 : pos
@ r9 : inc / mix-data
@ r10: len
@ r11: (start) / data
@ r12: src
@ r14: jj

.LChnLoop:
ldr r7, [r6] @ cnt
tst r7, #1
beq .LChnLoopInc

ldmib r6, {r8-r12} @ r8:pos, r9:inc, r10:len, r12:src
mul r5, r1, r7 @ vol
mov r5, r5, lsr #9
mov r14, r0 @ jj
mov r4, r3 @ buffer
.LMixLoop:
ldrb r11, [r12, r8, lsr #12]
add r8, r8, r9 @ pos += inc. Frees r9 for mix
ldrh r9, [r4]
mla r9, r5, r11, r9 @ mix += sample*vol
strh r9, [r4], #2

@ Test for end-of-data
cmp r8, r10
bge .LEndOfData
.LMixLoopInc:
subs r14, r14, #1
bne .LMixLoop
.LPostMixLoop:
str r8, [r6, #4]
.LChnLoopInc:
add r6, r6, 0x1C
subs r2, r2, #1
bne .LChnLoop
b .LPostChnLoop

@ Out of data: special cases
.LEndOfData:
tst r7, #2 @ Check for looping channel

@ if(loop) then start over
ldreq r8, [r6, #0x14]
beq .LMixLoopInc

@ else reset pos, disable channel and bug out
mov r8, #0
bic r7, r7, #1
str r7, [r3]
b .LPostMixLoop

@ Channel and mix loops done. Yay. Prep for down-sample.
.LPostChnLoop:

@ Down sample:
@ r0: nSamples
@ r1: data
@ r2: curMixBuffer
@ r3: tmpBuffer

ldr r2,=sndVars
ldr r2, [r2, #4] @ sndVars.curMixBuffer ?!?
add r2, r2, #1 @ Yes, I'm moving one up
.LDownSampleLoop:
ldrb r1, [r3], #2 @ get top-byte only
strb r1, [r2], #1
subs r0, r0, #1
bne .LDownSampleLoop

ldmfd sp!, {r4-r11, r14}
bx lr

It's almost the same as what you have now. I should probably point out that it's *cough* nearly 6am here, so the code above may not be accurate.

The main differences are that I'm not reloading tmpBuffer anywhere and that you're now using signed data instead of unsigned (which should it be, signed or unsigned?). You're also updating the mix-buffer now, something that wasn't there before.

About the number of samples: is this guaranteed to be even ? If it's odd, the rounding you apply for the tmpBuffer will adds an extra sample that isn't actually there.

There's also an interesting little tidbit about the mul instruction that you might want to know. The cycle-time for it is 1S + mI, with m being the number of significant bytes in the second operand. If you order the operands so that the second one is smaller, you may gain up to 3 cycles (the equivalent of 3 instructions).

EDIT : loop indentation

Last edited by Cearn on Tue Mar 18, 2008 7:54 pm; edited 1 time in total

Nice looking code Cearn. Well formated and commented. It's hard enough to read assembler without comments :) The way you downsample is also very well done.

The code looks pretty tight. The only thing I am wondering at this point is about the architecture/algorithms used. The inner loops are a mess. They're interleaved and the system is forced to jump back and forth. It also required both an end of buffer check and a decrement check. It seems to me that either one or the other should be sufficient with just a little care on the data creation side. Also, depending on where in RAM this is being stored, buffering the output to words may make sense.

Also,there is the possibly of using 4 byte reads and possibly buffering the output the 4-byte writes in the down sampler - but I have no idea if that is even needed because without data, I can't tell how much time is spent in any of the loops.

Finally, as this uses DMA, be very careful not to mix this code with other DMA code across interrupts. That causes very hard to track bugs.

Miked0801 wrote:

Nice looking code Cearn. Well formated and commented. It's hard enough to read assembler without comments :) The way you downsample is also very well done.

The code looks pretty tight. The only thing I am wondering at this point is about the architecture/algorithms used. The inner loops are a mess. They're interleaved and the system is forced to jump back and forth.

Actually, they aren't really interleaved, but since I messed up with my indentation I can see how it may have appeared that way. There are two loops, ChnLoop and MixLoop. They run like this:

Code:

.LChnLoop:
init for mixer

.LMixLoop:
mix samples

(end-of-data check)

subs r14, r14, #1
bne .LMixloop

.LPostMixLoop:
update chn->pos

subs r2, r2, #1
bne .LChnLoop

The mix loop runs from .LMixLoop to .LPostMixLoop, and is completely contained witin the channel loop. But what about .LEndOfData ? .LEndOfData is separate from both loops. It essentially functions as a sub-routine. Now, it does 'return' to two different places: one still inside the mix loop, and one after it. The reason for the different exits is that the latter return would result in a 'break' statement (see the C version) inside the look, and you can avoid that by simply skipping past the loop parts from the sub-routine.

The alternative is to put the end-of-data handler inside the mix loop and skip over it if pos is not out of bounds. Since branches are evil and pos will usually be smaller than len (ok, so that's an assumption, but it makes sense), it's better to have two infrequent branches than one frequent one. GCC actually does the same ... although it wastes quite a bit of energy with the stack and member loading.

You could also place the whole .LEndOfData part after the bx so that the "b .LPostChnLoop" can be removed.

Miked0801 wrote:

It also required both an end of buffer check and a decrement check. It seems to me that either one or the other should be sufficient with just a little care on the data creation side.

Yeah, it there maybe ways around it, but you still need to have separate checks for the number of samples and the end of the channel data in some form. If pos were decreasing, you could use subs/bmi instead of add/cmp/bge.

You can also fiddle with the chn updating in the channel loop and remove one or two instructions there, but I'm not sure that's really worth it; the loop only being executed 9 times and all.

Question:

Code:

.LEndOfData:
tst r7, #2 @ Check for looping channel

@ if(loop) then start over
ldreq r8, [r6, #0x14]
beq .LMixLoopInc

Did you mean 'ldrne' and 'bne'?

Ruben wrote:

Question:

Code:

.LEndOfData:
tst r7, #2 @ Check for looping channel

@ if(loop) then start over
ldreq r8, [r6, #0x14]
beq .LMixLoopInc

Did you mean 'ldrne' and 'bne'?

Uhm, yeah. If a set bit 1 means looping sound, then it should be -ne.

Cearn wrote:

EDIT : loop indentation

Interesting. Is there any sort of commonly accepted style for indentation of assembly language?
_________________
-- Where is he?
-- Who?
-- You know, the human.
-- I think he moved to Tilwick.

tepples wrote:

Cearn wrote:

EDIT : loop indentation

Interesting. Is there any sort of commonly accepted style for indentation of assembly language?

Whatever doesn't cause syntax errors. Some assemblers require labels to be aligned to the left column.
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."

tepples wrote:

Cearn wrote:

EDIT : loop indentation

Interesting. Is there any sort of commonly accepted style for indentation of assembly language?

Not that I know of, no. But using indentation makes loops stand out and hence improve readability.

Loops generally follow this pattern:

Code:

pre-loop code
.LLoop:
loop code
...
test for exit
bne .LLoop
post-loop code

I find it easier to see where the loop ends if I use

Code:

pre-loop code
.LLoop:
   loop code
   ...
   test for exit
   bne .LLoop
post-loop code

I prefer the format

Code:

label:
instr reg, reg
instr reg, xxx
blah blah
label2:
etc reg, #???

i.e with labels left aligned, the instructions on the second tab, and parameters on the third tab (tabsize 8). *Personally*, I think indentation throws around the parameters, and makes it harder to read.

OK, so I've moved on to a higher quality mixing routine and decided to use Cearn's 4byte -> 2byte idea. But when I tried it, it gave strange clicks and, according to No$GBA, it contains bugs. This is my loop ATM:

Code:

@ r0: samples to mix
@ r1: address of pS3MVars, which contains the current location of the mix buffer, etc
@ r2: current address of mix buffer
@ r3: address of tmpBuffer
@ pS3MVars + 0x18 = current mix buffer address
.LPreMixdownLoop:
ldr r1, =pS3MVars
ldr r2, [r1, #0x18]
ldr r3, =tmpBuffer

add r4, r2, r0 @ update the location of the mix buffer
str r4, [r1, #0x18]

mov r4, #0xFF
orr r4, r4, r4, lsl #0x10 @ m = 0x00FF00FF

.LMixdownLoop:
ldr r5, [r3], #0x04 @ d = 0xAAbbCCdd
and r5, r4, r5, lsr #0x08 @ d = d & m >> 8 = 0x00AA00CC
orr r5, r5, r5, lsr #0x08 @ d = d | d >> 8 = 0x00aaAACC
strh r5, [r2], #0x02
ldr r5, [r3], #0x04 @ d = 0xAAbbCCdd
and r5, r4, r5, lsr #0x08 @ d = d & m >> 8 = 0x00AA00CC
orr r5, r5, r5, lsr #0x08 @ d = d | d >> 8 = 0x00aaAACC
strh r5, [r2], #0x02

subs r0, r0, #0x04
bpl .LMixdownLoop

Oh, yeah. Cearn, you asked how many samples it mixes at a time right? Well, I'm not exactly sure of the figure but this is the code which calls the function:

Code:

//in sound.h...
#define S3M_TICKFPA 8
#define S3M_BUFSIZE 528

//in sound.c...
int pS3MSetTempo(int tempo) {
pS3MVars.sTT -= pS3MVars.sPT;
pS3MVars.sPT = pS3MDivide(pS3MVars.mFr*5<<S3M_TICKFPA, tempo*2); //smps p/tick = (mix freq * 5 << S3M_TICKFPA) / (tempo * 2)
pS3MVars.sTT += pS3MVars.sPT;

return 0;
}

int gS3MUpdate() {
int sL = S3M_BUFSIZE, sTT;
int (*mix)();

if(pS3MModVars.sigMode == 1) mix = pS3MMixdownS;
else mix = pS3MMixdown;

while(sL) {
if(pS3MVars.sTT < (1 << S3M_TICKFPA)) {
pS3MVars.sTT += pS3MVars.sPT; //samples til tick += smps p/tick
pS3MUpdateMod(); //process the row
}

sTT = pS3MVars.sTT >> S3M_TICKFPA;
if(sTT < sL) {
mix(sTT);
sL -= sTT;
pS3MVars.sTT -= (sTT << S3M_TICKFPA);
} else {
sTT = sL;

mix(sTT);
pS3MVars.sTT -= (sL << S3M_TICKFPA);
return -1;
}
}

return 0;
}

gbadev.org forum archive

ASM > Complete newbie question...

#152534 - Ruben - Mon Mar 17, 2008 11:30 am

#152535 - Ruben - Mon Mar 17, 2008 11:33 am

#152547 - Cearn - Mon Mar 17, 2008 4:41 pm

#152585 - Ruben - Mon Mar 17, 2008 11:56 pm

#152586 - Miked0801 - Mon Mar 17, 2008 11:58 pm

#152588 - Ruben - Tue Mar 18, 2008 12:34 am

#152603 - Cearn - Tue Mar 18, 2008 2:58 am

#152606 - Ruben - Tue Mar 18, 2008 3:15 am

#152608 - Ruben - Tue Mar 18, 2008 4:23 am

#152610 - Ruben - Tue Mar 18, 2008 5:06 am

#152614 - Cearn - Tue Mar 18, 2008 5:52 am

#152635 - Miked0801 - Tue Mar 18, 2008 7:09 pm

#152644 - Cearn - Tue Mar 18, 2008 8:34 pm

#152667 - Ruben - Wed Mar 19, 2008 3:38 am

#152680 - Cearn - Wed Mar 19, 2008 12:06 pm

#152694 - tepples - Wed Mar 19, 2008 7:59 pm

#152710 - Dwedit - Wed Mar 19, 2008 11:02 pm

#152905 - Cearn - Sat Mar 22, 2008 12:59 pm

#152908 - eKid - Sat Mar 22, 2008 2:09 pm

#155115 - Ruben - Fri Apr 25, 2008 3:44 am

#155116 - Ruben - Fri Apr 25, 2008 3:51 am