gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > Complete newbie question...

#152534 - Ruben - Mon Mar 17, 2008 11:30 am

Hi everyone. I was reading up on Deku's sound mixing thingy, optimized it, blah blah blah... but then I though, "I think I should do it in assembler." I came up with this code but I know for a FACT that there's something wrong in it... I just don't know WHAT LMAO!

Code:
.section .iwram, "ax", %progbits
.align 4
.arm
.global   SndMix, tmpBuffer
.extern sndVars, sngVars, sndChannels

@ r0: samples to mix
SndMix:
 stmfd    sp!, {r1-r12}                @ push {r1-r12}

@ Clear the temp buffer
 add      r0, r0, #0x01                @ r0   += 0x01 (to avoid LSR'ing 1)
 mov      r1, #0x04000000              @ r1    = 0x04000000 (REG_BASE)
 add      r1, r1, #0xD4                @ r1   += 0xD4 (DMA3SAD)
 mov      r2, #0x00000000              @ r2    = 0x00000000 (NULL)
 str      r2, [r1, #0x08]              @ *(r1 + 0x08) = r2 (DMACNT)
 ldr      r2, =ClrVal                  @ r2    = &ClrVal
 ldr      r3, =tmpBuffer               @ r3    = &sndTmpBuffer
 mov      r4, #0x85000000              @ r4    = 0x85000000 (MEMSET32)
 add      r4, r4, r0, lsr #0x01        @ r4   += r0 >> 1
 stmia    r1!, {r2-r4}                 @ *r4++ = r1, *r4++ = r2, *r4++ = r3
 
@ Get the global volume
 ldr      r1, =sngVars                 @ r1    = &sngVars
 ldr      r1, [r1]                     @ r1    = *((int*)r1) (gVol)

@ Loop through the channels and mix data
 mov      r2, #0x09                    @ r2    = 0x09
 ldr      r3, =sndChannels+0xE0        @ r3    = &(*sndChannels[8])

.LChnLoop:
 ldr      r4, [r3]                     @ r4  = *((int*)r3)      (CHN_CNT)
 ands     r5, r4, #0x01                @ \
 beq      .LChnNA                      @  if(!(r4 & 0x01)) goto .LChnLoopEnd

.LChnActive:
 ldr      r5, =tmpBuffer               @ r5  = &tmpBuffer
 mov      r6, r4, lsr #0x03            @ r6  = r4 >> 2          (CHN_VOL)
 muls     r7, r6, r1                   @ r7  = r6 * r1              |
 mov      r6, r7, lsr #0x06            @ r6  = r7 >> 6          (CHN_VOL)
 ldr      r7, [r3, #0x04]!             @ r7  = *(int*)(r3 += 4) (CHN_POS)
 ldr      r8, [r3, #0x04]!             @ r8  = *(int*)(r3 += 4) (CHN_INC)
 ldr      r9, [r3, #0x04]!             @ r9  = *(int*)(r3 += 4) (CHN_LEN)
 ldr      r10, [r3, #0x08]!            @ r10 = *(int*)(r3 += 8) (CHN_SRC)
 add      r3, r3, #0x04                @ r3 += 0x04             (CHN_FRQ) (end)
 sub      r3, r3, #0x1C
 stmfd    sp!, {r0}

@ Mix down
.LChnActMix:
 ldrb    r11, [r10, r7, lsr #0x0C]    @ r12 = *(char*)(r10+r11)
 mul      r11, r12, r6                 @ r11 = r12 * r6 (CHN_VOL)
 ldrsh    r12, [r5]
 add      r11, r11, r12
 strh     r11, [r5]
 add      r5, r5, #0x02
 add      r7, r7, r8
 cmp      r7, r9
 bge      .LChnEnd

.LChnLink:
 subs     r0, r0, #0x01
 bne      .LChnActMix

.LChnLinkEnd:
 ldmfd    sp!, {r0}
 str      r7, [r3, #0x04]
 b        .LChnLoopEnd                 @ goto .LChnLoopEnd

.LChnEnd:
 ands     r11, r4, #0x02
 beq      .LChnSmpEnd

.LChnSmpEndLoop:
 ldr      r11, [r3, #0x14]
 mov      r7, r11
 b        .LChnLink

.LChnSmpEnd:
 mov      r7, #0x00
 sub      r11, r4, #0x02
 str      r11, [r3]
 b        .LChnLinkEnd

.LChnNA:
.LChnLoopEnd:
 sub      r3, r3, #0x1C
 subs     r2, r2, #0x01                @ \
 bne      .LChnLoop                    @  if(r2-- != 0) goto .LChnLoop

.LDownsample:
 ldr      r1, =tmpBuffer-2
 ldr      r2, =sndVars
 ldr      r2, [r2]                     @ curmixbuffer
 sub      r2, r2, #0x01

.LDownsampleLoop:
 ldrh     r3, [r1, #0x02]!
 mov      r4, r3, lsr #0x08
 strb     r4, [r2, #0x01]!
 subs     r0, r0, #0x01
 bne      .LDownsampleLoop

.LEnd:
 ldmfd    sp!, {r1-r12}                @ pop {r1-r12}
 bx       lr                           @ bx to lr

.align 4

ClrVal:
 .word 0x00000000

tmpBuffer:
 .space 736*2

.end


The externs sndVars, sngVars and sndChannels are as follow:

Code:

typedef struct __attribute__ ((aligned(4))) {
 s8  *mixBufferBase;
 s8  *curMixBuffer;
 u8  activeBuffer;
 u32 smpsTilTick;
 u32 smpsPerTick;
 u32 mixFreq;
 u32 rcpMixFreq;
 u32 mixBufferSize;
} SND_VARS;

typedef struct __attribute__ ((aligned(4))) {
 u32        gVol;
 u32        mode;
 u32        state;
 u8         tickA;
 u8         tickB;
 u8         row;
 ... ... ...
} MOD_VARS;

typedef struct __attribute__ ((aligned(4))) {
 u32  cnt;
 u32 pos;
 u32 inc;
 u32 len;
 u32 loopStart;
 s8  *dat;
 u32 freq;
} SND_CHANNEL;

...

SND_VARS sndVars;
MOD_VARS sngVars;
SND_CHANNEL sndChannels[9];


I know, I know... lots of room for optimization but I just wanna get this working first. Thanks a lot guys.

#152535 - Ruben - Mon Mar 17, 2008 11:33 am

Oh, and BTW: I know... lots of old comments and typos lol...

#152547 - Cearn - Mon Mar 17, 2008 4:41 pm

Ruben wrote:
Code:
@ Mix down
.LChnActMix:
 ldrb    r11, [r10, r7, lsr #0x0C]    @ r12 = *(char*)(r10+r11)
 mul     r11, r12, r6                 @ r11 = r12 * r6 (CHN_VOL)

The comments do not match the code ... did you mean "ldrb r12, ..." ?

Some other points (for when it's actually working):
  • You don't need to save r0-r3 and r12 on the stack; the caller expects these to be clobbered.
  • IIRC, CpuFastSet (swi 0xC0000) is actually about 10% faster than DMA for filling large amounts of data (provided it's a multiple of 32 bytes long). if the situation allows for it, try using that some time as well. The swi's do have a substantial overhead though.
  • In .LChnActive, you're using stuff like "ldr r7, [r3, #0x04]!". You don't have to use write-back all the time. It's probably more readable if you didn't. Example:
Code:
   ldr      r7, [r3, #0x04]             @ chn->pos
   ldr      r8, [r3, #0x08]             @ chn->inc
   ldr      r9, [r3, #0x0C]             @ chn->len
   ldr      r10, [r3, #0x14]            @ chn->dat

This would also mean that the addition and subtraction you have now aren't necessary. Of course, if you want to be 1337 (and fast), you could use something like "ldmib r3, {r7-r9}", but you'd have to be careful where your pointer ends up. (Note: untested, but I think it should work.)
  • On
  • Code:
    .LChnEnd:
        ands     r11, r4, #0x02
        beq      .LChnSmpEnd

    .LChnSmpEndLoop:
        ldr      r11, [r3, #0x14]
        mov      r7, r11
        b        .LChnLink
    .LChnSmpEnd:
    Conditional opcodes are teh awesome:
    Code:
    .LChnEnd:
        tst      r4, #0x02         @ like ands, but sans extra register
        ldrne    r7, [r3, #0x14]   @ .LChnSmpEndLoop:
        bne     .LChnLink
    .LChnSmpEnd:

  • In
  • Code:
     ldrsh    r12, [r5]
     add      r11, r11, r12
     strh     r11, [r5]
     add      r5, r5, #0x02
    The store and addition can be combined. "strh r11, [r5], #2" would increment r5 by 2 after the store. Something similar is also useful in the down-sample part, which, if I read it correctly, currently decrements the pointers before the loop so that the "Rd, [Rn, Op2]!" parts work properly inside it.
    Have you checked what the compiler makes of it, by the way? The code doesn't seems too complicated, and I'm not sure the compiler would come up with something that much slower. That said, it's still a nice exercise :)

    #152585 - Ruben - Mon Mar 17, 2008 11:56 pm

    Hehehe... yeah, I did mean r12. Like I said: those were older comments. But I got it sorted now, it works fine (though it has a weird click...). So now I'm gonna try your suggestions. Thanks a lot!

    BTW: Yeah, I did check the code the compiler came up with. About 2-3 scanlines slower, which in a lot of channels adds up.

    #152586 - Miked0801 - Mon Mar 17, 2008 11:58 pm

    Always use ldm/stm ops when possible. They will save you 1-2 cycles per register used.

    On the Swi calls - pretty much don't do them unless you are copying huge amounts of info. There is somewhere in the neighborhood on 80 cycles of overhead getting in and out of the system crap. Pretty much, you are better off writing your own fast copy in IWRAM and calling that.

    In general when optimizing ARM code, if you see alot of mov and cmp/tst type instructions, there are ways to speed things up.

    The stack is your enemy. Don't use the stack if you can at all help it.

    Now to the code itself:
    In LChnLoop:
    instead of anding, you can use tst and preserve r5.

    In Active:
    Your comment says >> 2 but the code is lsr 3.
    You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting.
    The 4 ldr instuctions feel like they could easily become an ldm instruction of some sort.
    Your last 2 instructions cancel each other. Add 4 then subtract 28 (0x1c) from the same register is the same as subtract 24 (0x18)
    Are you sure you have to push/pop r0? That's wasteful
    at LChnEnd: Your ands could be replace with tst to save a register.
    The 3 instructions after your and at LChnEnd should probably be conditionally executed
    Is writeback mode even allowed when accessing halfwords? I didn't think so, but if it compiles, fine.
    In your downsample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent.
    Don't save r1,r2,r3 or r12. Do use lr (r14) if you need an extra temp register.

    #152588 - Ruben - Tue Mar 18, 2008 12:34 am

    "Your comment says >> 2 but the code is lsr 3."

    Like I said, older comments :)

    "You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting."

    What exactly do you mean? That kinda confused me.

    "The 4 ldr instuctions feel like they could easily become an ldm instruction of some sort."

    Done! :)

    "Your last 2 instructions cancel each other. Add 4 then subtract 28 (0x1c) from the same register is the same as subtract 24 (0x18)"

    Like I said, I wanted to get this working before optimizing :)

    "Are you sure you have to push/pop r0? That's wasteful"

    Unless there's some other way to save r0 somewhere else...

    "at LChnEnd: Your ands could be replace with tst to save a register."
    "The 3 instructions after your and at LChnEnd should probably be conditionally executed"

    Done. :)

    "In your downsample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent."

    Explain! It's confusing me LOL!

    #152603 - Cearn - Tue Mar 18, 2008 2:58 am

    As far as I can tell, this is the routine in C:
    Code:
    void SndMix(u32 nSamples)
    {
       // Clear tmp buffer
       volatile u32 fill= 0;
       REG_DMA3CNT= 0;
       REG_DMA3SRC= &fill;
       REG_DMA3DST= &tmpBuffer;
       REG_DMA3CNT= DMA_ENABLE | DNA_32 | DMA_SRC_FIXED;

       // Main variables
       u32 mainVol= sngVars.gVolume;
       SND_CHANNEL *chn= &sndChannels[8];
       u16 *buffy;

       // Channel loop
       for(ii=0; ii<9; ii++)
       {
          // (.LChnLoop)
          u32 cnt= chn->cnt;
          if(cnt&1)      // Only use active channels
          {
             // .LChnActive
             buffy= tmpBuffer;
             u32 vol= mainVol*(cnt>>3)>>6;   // p0 ?!? volume, orly ?
             u32 pos= chn->pos;            // Q12 number
             u32 inc= chn->inc;
             u32 len= chn->len;
             u8 *src= chn->dat;
             
             // Sample loop (.LChnActMix)
             for(jj=0; jj<nSamples; jj++)
             {
                buffy[jj] += src[pos>>12] * vol;

                pos += inc;
                if(pos > len)      // (.LChnEnd)
                {
                   // Check for looping sound
                   if(cnt & 2)      // (.LChnSmpEndLoop)
                   {
                      pos= chn->dat;       // p1 ?!?
                   }
                   else
                   {
                      chn->cnt -= 2;      // p2 ?!?
                      pos= 0;
                      break;
                   }
                }
             }
             chn->pos= pos;
          }
          chn--;
       }

       // Down-sample
       buffy= tmpBuffer;
       u8 *dst= sndVars.mixBufferBase;
       for(jj=0; jj<nSamples; jj++)
          *dst++ = *buffy++;
    }
    If so, I have 5 questions:

    What is the usual size of nSamples ? Order of magnitude figure will do.

    What happens if the sum of the channels exceeds 16 bits?

    (p0) : How exactly is the channel volume hidden inside chn->cnt ? Is it really bits 3 to 31 ?

    (p1) : for looping channels, the asm sets the position to chn->dat. Shouldn't this be chn->loopStart ?

    (p2) : why subtract the control by 2. Isn't the 'on' bit in the first bit, not the second?


    Miked0801 wrote:
    Is writeback mode even allowed when accessing halfwords? I didn't think so, but if it compiles, fine.

    You're probably thinking about a shifted Op2. Those don't work for halfwords, but writeback is fine.

    Ruben wrote:
    Miked0801 wrote:
    You may be able to drop the first mov instuction by shifting 9 on the 2nd mov - overflow permitting.


    What exactly do you mean? That kinda confused me.

    "a * (b>>3) >> 6" should be pretty close to "(a*b)>>3>>6 = (a*b)>>9". The difference is a*(b%8)>>6. If a*7 is smaller than 64 (and if a*b fits into 32 bits), you can use the >>9 form without any consequence. If a*7 > 64, there would be a very small but possibly inaudible difference.

    Ruben wrote:
    Miked0801 wrote:
    Are you sure you have to push/pop r0? That's wasteful

    Unless there's some other way to save r0 somewhere else...

    You still have the link register (lr / r14) left over that you can use don't forget to stack it, of course. I think it's possible to reduce the register usages by one or two by the way.

    Ruben wrote:
    Miked0801 wrote:
    In your down-sample loop, if the data is aligned properly, you can use 4 byte reads and 2 byte writes to get a nice speed up. You may also be able to unroll that loop a bit - data dependent.


    Explain! It's confusing me LOL!

    Right now you're converting one halfword into one byte per loop. If the source pointer is 32-bit aligned (which it is) and the sndVars buffer is halfword aligned, you can cover two samples in one loop like this:

    Code:
       @ Assuming :
       @ r0 : u16 *dst= tmpBuffer;
       @ r1 : u32 *src= sdnVars.mixBufferBase;
       @ r2 : nSamples
       @ r3 : data
       @ r4 : mask (0x00FF00FF)
       
       mov      r4, 0xFF
       mov      r4, r4, lsl #16
    .LDownSampleLoop:
       ldr      r3, [r1], #4         @ p = 0xDDccBBaa
       and      r3, r4, r3, lsr #8      @ p = m & p>>8 = 0x00DD00BB
       orr      r3, r3, r3, lsr #8      @ p = p | p>>8 = 0x00DDDDBB
       strh   r3, [r0], #2         @ store 0xDDBB
       subs   r2, r2, #2
       bne      .LDownSampleLoop

    This can also be extended to a 4 halfword -> 4 byte loop with ease. And then unrolled to get rid of some loop overhead.

    #152606 - Ruben - Tue Mar 18, 2008 3:15 am

    (p0) : How exactly is the channel volume hidden inside chn->cnt ? Is it really bits 3 to 31 ?

    Yeahuh. I don't really like using halfwords for stuff (in case you didn't already notice lol).

    (p1) : for looping channels, the asm sets the position to chn->dat. Shouldn't this be chn->loopStart ?

    ... ... ... it does? *double checks and fixes it up*

    (p2) : why subtract the control by 2. Isn't the 'on' bit in the first bit, not the second?

    Hehe. Yeah, it's fixed up in my new code

    "(a*b)>>3>>6 = (a*b)>>9"

    Is that basically (assuming r4 is the destination, r5: a, r6: b)
    Code:
    mul r7, r5, r6
    mov r4, r7, lsr #0x09


    "You still have the link register (lr / r14) left over"

    Wouldn't that mean I still have to do some stacking?

    "If the source pointer is 32-bit aligned (which it is) and the sndVars buffer is halfword aligned, you can cover two samples in one loop"

    Since I'm not really sure what you mean, I'm assuming you mean something like align 4... IDK... lol...

    #152608 - Ruben - Tue Mar 18, 2008 4:23 am

    Oh, and in regards to that code:
    1) Why is the tmpBuffer the destination?
    2) Why is the 'source' 'mixBufferBase' and not 'curMixBuffer'? IDK if those weren't typos but when I tried that code, NO$GBA just crashed...

    #152610 - Ruben - Tue Mar 18, 2008 5:06 am

    OK. After trying out that code (and changing it around to make it work! lol), I figured it didn't really speed up much so I'm just gonna leave it. This is my newest code. Haven't double checked the comments so ignore them... Any room for optimization?

    Code:
    SndMix:
     stmfd    sp!, {r4-r11,lr}             @ push {r4-r11,lr}
     add      r5, r0, #0x01                @ r5    = r0 + 0x01
     ldr      r1, =0x040000D4              @ r1    = 0x040000D4 (DMA3SAD)
     mov      r2, #0x00000000              @ r2    = 0x00000000 (NULL)
     str      r2, [r1, #0x08]              @ *(r1 + 0x08) = r2  (DMACNT)
     ldr      r2, =ClrVal                  @ r2    = &ClrVal
     ldr      r3, =tmpBuffer               @ r3    = &sndTmpBuffer
     mov      r4, #0x85000000              @ r4    = 0x85000000 (MEMSET32)
     add      r4, r4, r5, lsr #0x01        @ r4   += r5 >> 0x01
     stmia    r1, {r2-r4}                  @ SAD = r2, DAD = r3, CNT = r4
     ldr      r1, =sngVars                 @ r1    = &sngVars
     ldr      r1, [r1]                     @ r1    = sngVars.gVol
     mov      r2, #0x09                    @ r2    = 0x09
     ldr      r3, =sndChannels+0xE0        @ r3    = &sndChannels[8]

    .LChnLoop:
     ldr      r4, [r3]                     @ r4  = chn->cnt
     tst      r4, #0x01                    @ \
     beq      .LChnNA                      @  if(!(r4 & 0x01)) goto .LChnLoopEnd

    .LChnActive:
     ldr      r5, =tmpBuffer               @ r5  = &tmpBuffer
     mul      r7, r4, r1                   @ r7  = cnt * gVol
     mov      r6, r7, lsr #0x09            @ r6  = r7 >> 9 (VOLUME)
     ldmib    r3, {r7-r10}                 @ r7 = chn->pos, r8 = chn->inc,
                                           @ r9 = chn->len, r10 = chn->data
     mov      lr, r0                       @ lr = samplesToMix

    .LChnActMix:
     mov      r11, r7, lsr #0x0C           @ r11  = r7 >> 0x0C
     ldrsb    r12, [r10, r11]              @ r12  = *(char*)(r10+r11)
     mul      r11, r12, r6                 @ r11  = r12 * r6 (VOLUME)
     ldrsh    r12, [r5]                    @ r12  = *(short*)(r5)
     add      r11, r11, r12                @ r11 += r12
     strh     r11, [r5], #0x02             @ r11  = r5; r5 += 2
     add      r7, r7, r8                   @ r7  += r8 (chn->inc)
     cmp      r7, r9                       @ \
     bge      .LChnEnd                     @  if(chn->pos >= chn->len) goto .LChnEnd

    .LChnLink:
     subs     lr, lr, #0x01                @ \
     bne      .LChnActMix                  @  if(--r0) goto .LChnActMix

    .LChnLinkEnd:
     str      r7, [r3, #0x04]              @ (r3 + 0x04) = r7 (chn->pos)
     b        .LChnLoopEnd                 @ goto .LChnLoopEnd

    .LChnEnd:
     tst      r4, #0x02                    @\
     ldrne    r7, [r3, #0x14]              @ if(chn->cnt & LOOP)
     bne      .LChnLink                    @  r7 = loopStart; goto .LChnLink

    .LChnSmpEnd:
     mov      r7, #0x00                    @ r7  = 0x00 (chn->pos)
     sub      r11, r4, #0x01               @ r4  = r11 - 0x01 (CHN_ACT)
     str      r11, [r3]                    @ r3  = r11
     b        .LChnLinkEnd                 @ goto .ChnLinkEnd

    .LChnNA:
    .LChnLoopEnd:
     sub      r3, r3, #0x1C                @ r3 -= sizeof(channel)
     subs     r2, r2, #0x01                @ \
     bne      .LChnLoop                    @  if(--r2) goto .LChnLoop

    .LDownsample:
     ldr      r1, =sndVars                 @ r1     = &sndVars
     ldr      r2, [r1, #0x04]              @ r2     = sndVars->curMixBuffer
     ldr      r3, =tmpBuffer               @ r3     = &tmpBuffer
     mov      r4, r2                       @ r4     = sndVars->curMixBuffer
     add      r4, r4, r0                   @ r4    += samplesToMix
     str      r4, [r1, #0x04]              @ curMixBuffer = r4

    .LDownsampleLoop:
     ldrh     r4, [r3], #0x02              @ r4 = *(short*)(r3); r3 += 0x02
     mov      r4, r4, lsr #0x08            @ r4 >>= 0x08
     strb     r4, [r2], #0x01              @ r2 = r4; r2++
     subs     r0, r0, #0x01                @ \
     bne      .LDownsampleLoop             @  if(--r0) .LDownsampleLoop

    .LEnd:
     ldmfd    sp!, {r4-r11,lr}             @ pop {r4-r11,lr}
     bx       lr                           @ bx to lr

    .align 4

    ClrVal:
     .word 0x00000000

    tmpBuffer:
     .space 736*2

    And I also changed this around:
    Code:
    typedef struct __attribute__ ((aligned(4))) {
     u32  cnt;
     
     //bit 1 = on, bit 2 = looped, bit 3-32 = vol
     
     u32 pos;
     u32 inc;
     u32 len;
     s8  *dat;
     u32 loopStart;
     u32 freq;
    } SND_CHANNEL;

    #152614 - Cearn - Tue Mar 18, 2008 5:52 am

    This is what I came up with ...

    Code:
    SndMix:
        stmfd   sp!, {r4-r11, r14}

        @ Clear buffer
        mov     r2, #0x04000000
        add     r1, r2, #0xD4
        str     r2, [r1, #8]            @ Clear REG_DMA3CNT
        ldr     r2,=ClrVal
        ldr     r3,=tmpBuffer
        add     r4, r0, #1
        add     r4, r4, r4, lsr #1
        add     r4, r4, 0x85000000
        stmfd   r1!, {r2-r4}
       
        @ Set-up main loop:
        ldr     r1,=sngVars
        ldr     r1, [r1]                @ master volume
        ldr     r6,=sndChannels         @ chn
        mov     r2, #9                  @ ii

        @ Register list:
        @ r0 :  nSamples
        @ r1 :  mainVol
        @ r2 :  ii
        @ r3 :  tmpBuffer
        @ r4 :  dst
        @ r5 :  vol
        @ r6 :  chn
        @ r7 :  cnt
        @ r8 :  pos
        @ r9 :  inc / mix-data
        @ r10:  len
        @ r11:  (start) / data
        @ r12:  src
        @ r14:  jj

    .LChnLoop:
            ldr     r7, [r6]            @ cnt
            tst     r7, #1
            beq     .LChnLoopInc

            ldmib   r6, {r8-r12}        @ r8:pos, r9:inc, r10:len, r12:src
            mul     r5, r1, r7          @ vol
            mov     r5, r5, lsr #9
            mov     r14, r0             @ jj
            mov     r4, r3              @ buffer
    .LMixLoop:
                ldrb    r11, [r12, r8, lsr #12]
                add     r8, r8, r9          @ pos += inc. Frees r9 for mix
                ldrh    r9, [r4]
                mla     r9, r5, r11, r9     @ mix += sample*vol
                strh    r9, [r4], #2

                @ Test for end-of-data
                cmp     r8, r10
                bge     .LEndOfData
    .LMixLoopInc:
                subs    r14, r14, #1
                bne     .LMixLoop
    .LPostMixLoop:
            str     r8, [r6, #4]
    .LChnLoopInc:
            add     r6, r6, 0x1C
            subs    r2, r2, #1
            bne     .LChnLoop
        b       .LPostChnLoop

                @ Out of data: special cases
    .LEndOfData:
                tst     r7, #2      @ Check for looping channel

                @ if(loop) then start over
                ldreq   r8, [r6, #0x14]
                beq     .LMixLoopInc

                @ else  reset pos, disable channel and bug out
                mov     r8, #0
                bic     r7, r7, #1
                str     r7, [r3]
                b       .LPostMixLoop

        @ Channel and mix loops done. Yay. Prep for down-sample.
    .LPostChnLoop:
       
        @ Down sample:
        @ r0: nSamples
        @ r1: data
        @ r2: curMixBuffer
        @ r3: tmpBuffer

        ldr     r2,=sndVars
        ldr     r2, [r2, #4]    @ sndVars.curMixBuffer ?!?
        add     r2, r2, #1      @ Yes, I'm moving one up
    .LDownSampleLoop:
            ldrb    r1, [r3], #2    @ get top-byte only
            strb    r1, [r2], #1
            subs    r0, r0, #1
            bne     .LDownSampleLoop

        ldmfd   sp!, {r4-r11, r14}
        bx      lr

    It's almost the same as what you have now. I should probably point out that it's *cough* nearly 6am here, so the code above may not be accurate.

    The main differences are that I'm not reloading tmpBuffer anywhere and that you're now using signed data instead of unsigned (which should it be, signed or unsigned?). You're also updating the mix-buffer now, something that wasn't there before.

    About the number of samples: is this guaranteed to be even ? If it's odd, the rounding you apply for the tmpBuffer will adds an extra sample that isn't actually there.

    There's also an interesting little tidbit about the mul instruction that you might want to know. The cycle-time for it is 1S + mI, with m being the number of significant bytes in the second operand. If you order the operands so that the second one is smaller, you may gain up to 3 cycles (the equivalent of 3 instructions).

    EDIT : loop indentation


    Last edited by Cearn on Tue Mar 18, 2008 7:54 pm; edited 1 time in total

    #152635 - Miked0801 - Tue Mar 18, 2008 7:09 pm

    Nice looking code Cearn. Well formated and commented. It's hard enough to read assembler without comments :) The way you downsample is also very well done.

    The code looks pretty tight. The only thing I am wondering at this point is about the architecture/algorithms used. The inner loops are a mess. They're interleaved and the system is forced to jump back and forth. It also required both an end of buffer check and a decrement check. It seems to me that either one or the other should be sufficient with just a little care on the data creation side. Also, depending on where in RAM this is being stored, buffering the output to words may make sense.

    Also,there is the possibly of using 4 byte reads and possibly buffering the output the 4-byte writes in the down sampler - but I have no idea if that is even needed because without data, I can't tell how much time is spent in any of the loops.

    Finally, as this uses DMA, be very careful not to mix this code with other DMA code across interrupts. That causes very hard to track bugs.

    #152644 - Cearn - Tue Mar 18, 2008 8:34 pm

    Miked0801 wrote:
    Nice looking code Cearn. Well formated and commented. It's hard enough to read assembler without comments :) The way you downsample is also very well done.

    The code looks pretty tight. The only thing I am wondering at this point is about the architecture/algorithms used. The inner loops are a mess. They're interleaved and the system is forced to jump back and forth.

    Actually, they aren't really interleaved, but since I messed up with my indentation I can see how it may have appeared that way. There are two loops, ChnLoop and MixLoop. They run like this:

    Code:
    .LChnLoop:
        init for mixer
       
    .LMixLoop:
            mix samples

            (end-of-data check)

            subs    r14, r14, #1
            bne     .LMixloop

    .LPostMixLoop:
        update chn->pos

        subs    r2, r2, #1
        bne     .LChnLoop

    The mix loop runs from .LMixLoop to .LPostMixLoop, and is completely contained witin the channel loop. But what about .LEndOfData ? .LEndOfData is separate from both loops. It essentially functions as a sub-routine. Now, it does 'return' to two different places: one still inside the mix loop, and one after it. The reason for the different exits is that the latter return would result in a 'break' statement (see the C version) inside the look, and you can avoid that by simply skipping past the loop parts from the sub-routine.

    The alternative is to put the end-of-data handler inside the mix loop and skip over it if pos is not out of bounds. Since branches are evil and pos will usually be smaller than len (ok, so that's an assumption, but it makes sense), it's better to have two infrequent branches than one frequent one. GCC actually does the same ... although it wastes quite a bit of energy with the stack and member loading.

    You could also place the whole .LEndOfData part after the bx so that the "b .LPostChnLoop" can be removed.

    Miked0801 wrote:
    It also required both an end of buffer check and a decrement check. It seems to me that either one or the other should be sufficient with just a little care on the data creation side.

    Yeah, it there maybe ways around it, but you still need to have separate checks for the number of samples and the end of the channel data in some form. If pos were decreasing, you could use subs/bmi instead of add/cmp/bge.

    You can also fiddle with the chn updating in the channel loop and remove one or two instructions there, but I'm not sure that's really worth it; the loop only being executed 9 times and all.

    #152667 - Ruben - Wed Mar 19, 2008 3:38 am

    Question:

    Code:
    .LEndOfData:
                tst     r7, #2      @ Check for looping channel

                @ if(loop) then start over
                ldreq   r8, [r6, #0x14]
                beq     .LMixLoopInc

    Did you mean 'ldrne' and 'bne'?

    #152680 - Cearn - Wed Mar 19, 2008 12:06 pm

    Ruben wrote:
    Question:

    Code:
    .LEndOfData:
                tst     r7, #2      @ Check for looping channel

                @ if(loop) then start over
                ldreq   r8, [r6, #0x14]
                beq     .LMixLoopInc

    Did you mean 'ldrne' and 'bne'?

    Uhm, yeah. If a set bit 1 means looping sound, then it should be -ne.

    #152694 - tepples - Wed Mar 19, 2008 7:59 pm

    Cearn wrote:
    EDIT : loop indentation

    Interesting. Is there any sort of commonly accepted style for indentation of assembly language?
    _________________
    -- Where is he?
    -- Who?
    -- You know, the human.
    -- I think he moved to Tilwick.

    #152710 - Dwedit - Wed Mar 19, 2008 11:02 pm

    tepples wrote:
    Cearn wrote:
    EDIT : loop indentation

    Interesting. Is there any sort of commonly accepted style for indentation of assembly language?

    Whatever doesn't cause syntax errors. Some assemblers require labels to be aligned to the left column.
    _________________
    "We are merely sprites that dance at the beck and call of our button pressing overlord."

    #152905 - Cearn - Sat Mar 22, 2008 12:59 pm

    tepples wrote:
    Cearn wrote:
    EDIT : loop indentation

    Interesting. Is there any sort of commonly accepted style for indentation of assembly language?
    Not that I know of, no. But using indentation makes loops stand out and hence improve readability.

    Loops generally follow this pattern:
    Code:
       pre-loop code
    .LLoop:
       loop code
       ...
       test for exit
       bne .LLoop
       post-loop code

    I find it easier to see where the loop ends if I use
    Code:
       pre-loop code
    .LLoop:
          loop code
          ...
          test for exit
          bne .LLoop
       post-loop code

    #152908 - eKid - Sat Mar 22, 2008 2:09 pm

    I prefer the format

    Code:

    label:
            instr   reg, reg
            instr   reg, xxx
            blah    blah
    label2:
            etc     reg, #???

    i.e with labels left aligned, the instructions on the second tab, and parameters on the third tab (tabsize 8). *Personally*, I think indentation throws around the parameters, and makes it harder to read.

    #155115 - Ruben - Fri Apr 25, 2008 3:44 am

    OK, so I've moved on to a higher quality mixing routine and decided to use Cearn's 4byte -> 2byte idea. But when I tried it, it gave strange clicks and, according to No$GBA, it contains bugs. This is my loop ATM:

    Code:
    @ r0: samples to mix
    @ r1: address of pS3MVars, which contains the current location of the mix buffer, etc
    @ r2: current address of mix buffer
    @ r3: address of tmpBuffer
    @ pS3MVars + 0x18 = current mix buffer address
    .LPreMixdownLoop:
     ldr        r1, =pS3MVars
     ldr        r2, [r1, #0x18]
     ldr        r3, =tmpBuffer
     
     add        r4, r2, r0               @ update the location of the mix buffer
     str        r4, [r1, #0x18]

     mov        r4, #0xFF
     orr        r4, r4, r4, lsl #0x10    @ m = 0x00FF00FF

    .LMixdownLoop:
     ldr        r5, [r3], #0x04          @ d =              0xAAbbCCdd
     and        r5, r4, r5, lsr #0x08    @ d = d & m >> 8 = 0x00AA00CC
     orr        r5, r5, r5, lsr #0x08    @ d = d | d >> 8 = 0x00aaAACC
     strh       r5, [r2], #0x02
     ldr        r5, [r3], #0x04          @ d =              0xAAbbCCdd
     and        r5, r4, r5, lsr #0x08    @ d = d & m >> 8 = 0x00AA00CC
     orr        r5, r5, r5, lsr #0x08    @ d = d | d >> 8 = 0x00aaAACC
     strh       r5, [r2], #0x02
     
     subs       r0, r0, #0x04
     bpl        .LMixdownLoop

    #155116 - Ruben - Fri Apr 25, 2008 3:51 am

    Oh, yeah. Cearn, you asked how many samples it mixes at a time right? Well, I'm not exactly sure of the figure but this is the code which calls the function:

    Code:
    //in sound.h...
    #define S3M_TICKFPA 8
    #define S3M_BUFSIZE 528

    //in sound.c...
    int pS3MSetTempo(int tempo) {
     pS3MVars.sTT -= pS3MVars.sPT;
     pS3MVars.sPT  = pS3MDivide(pS3MVars.mFr*5<<S3M_TICKFPA, tempo*2); //smps p/tick = (mix freq * 5 << S3M_TICKFPA) / (tempo * 2)
     pS3MVars.sTT += pS3MVars.sPT;
     
     return 0;
    }

    int gS3MUpdate() {
     int sL = S3M_BUFSIZE, sTT;
     int (*mix)();
     
     if(pS3MModVars.sigMode == 1) mix = pS3MMixdownS;
     else mix = pS3MMixdown;
     
     while(sL) {
      if(pS3MVars.sTT < (1 << S3M_TICKFPA)) {
       pS3MVars.sTT += pS3MVars.sPT; //samples til tick += smps p/tick
       pS3MUpdateMod(); //process the row
      }
     
      sTT = pS3MVars.sTT >> S3M_TICKFPA;
      if(sTT < sL) {
       mix(sTT);
       sL -= sTT;
       pS3MVars.sTT -= (sTT << S3M_TICKFPA);
      } else {
       sTT = sL;
       
       mix(sTT);
       pS3MVars.sTT -= (sL  << S3M_TICKFPA);
       return -1;
      }
     }
     
     return 0;
    }