gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > Looking for some optimization tips (ima-adpcm)

#167050 - DiscoStew - Fri Feb 27, 2009 8:23 pm

After eKid released his ms-adpcm decoder using assembly for fast stream decoding, I wanted to improve my own ima-adpcm decoding by using assembly as well. With his permission, I used the general template from his decoder, made some changes to some structures, and made the decoding code from scratch in assembly, which took some time to do because of my inexperience with dealing with assembly upfront. It all works, but because of my inexperience, I have no idea if there is any room for improving it. So, in order for me to learn, I've posted the decoding core below, and maybe some of you could teach me the ropes.

The interface mimics eKid's own ms-adpcm decoder, but the input data pointer is for my own structure, also listed below. If there are any question as to why I'm doing X and Y, just ask me.

NOTE: For those currently using my C/C++ implemented ima-adpcm decoder, do not switch to using this, because the structuring is different. When improvements are made (if any can be) and code is organized, I will release it.

Code:

typedef struct t_IMA_Adpcm_Data {

   int curSamps;
   int data1, step1, samp1;
   int data2, step2, samp2;
} IMA_Adpcm_Data;

Code:

.global decode_stereo_ima_adpcm

.text
.arm
.align

#define cur      r0
#define da1      r4
#define st1      r5
#define sa1      r6
#define da2      r7
#define st2      r8
#define sa2      r9

#define   ta      r10
#define   tb      r11
#define tc      r12

#define istab   r14

#define src      r1
#define dest   r2
#define iter   r3


/*   
   SampXCode = Stream_CurSamples & 0xF;
   Stream_CurSamples >>= 4;
   Diff = (((( SampXCode & 0x7 ) << 1 ) + 1 ) * STREAM_StepTab[ Stream_StepIndex ]) >> 3;
   Stream_StepIndex += STREAM_IndexTab[ SampXCode ];
   Stream_LastSample += ( SampXCode & 0x8 ? -Diff : Diff );
   
   if( Stream_StepIndex > 88 )         Stream_StepIndex = 88;
   if( Stream_StepIndex < 0 )      Stream_StepIndex = 0;
   if( Stream_LastSample > 32767 )      Stream_LastSample = 32767;
   if( Stream_LastSample <= -32768 )   Stream_LastSample = -32768;
   *target++ = Stream_LastSample;
   
 */

/*************************************************************
 *                                                           *
 * decode_stereo_ima_adpcm( data, source, target, iterations ) *
 *                                                           *
 *************************************************************/

decode_stereo_ima_adpcm:

   push   {r0,r4-r11,lr}
   ldmia   r0, {cur,da1,st1,sa1,da2,st2,sa2}
   adr      istab, Index_Step_Table

.examine_cur:
   cmp   cur, #0
   beq   .reloadwords
.decode_word_loop:

/*************** Left channel *********************/

   ldr      tc, [istab, st1, lsl#2]                  // tc = Step_Index_Table[ Step_Index1 ]

   and      ta, da1, #15                        // ta = CurSamples1 & 0xF
   lsr      da1, da1, #4                        // CurSamples1 >>= 4
   and      tb, ta, #7                           // tb = ta & 0x7
   cmp      ta, #8                              // Is ta >= 8 (ExamineA)
   
   add      ta, #1                              // ta += 1
   ldr      ta, [istab, -ta, lsl#2]                  // ta = StepIndex_Table[ -ta ]
   add      st1, ta                              // Step_Index1 += ta
   
   lsl      tb, #1                              // Diff = ((( tb << 1 ) + 1) * tc) >> 3
   add      tb, #1
   smulbb   ta, tb, tc
   asr      ta, #3
   
   subge   sa1, ta                              // Samp1 -= ta if ExamineA is true
   addlt   sa1, ta                              // Samp1 += ta if ExamineA is false
   
/*************** Right channel ********************/

   ldr      tc, [istab, st2, lsl#2]                  // tc = Step_Index_Table[ Step_Index2 ]

   and      ta, da2, #15                        // ta = CurSamples2 & 0xF
   lsr      da2, da2, #4                        // CurSamples2 >>= 4
   and      tb, ta, #7                           // tb = ta & 0x7
   cmp      ta, #8                              // Is ta >= 8 (ExamineB)
   
   add      ta, #1                              // ta += 1
   ldr      ta, [istab, -ta, lsl#2]                  // ta = StepIndex_Table[ -ta ]
   add      st2, ta                              // Step_Index2 += ta

   lsl      tb, #1                              // Diff = ((( tb << 1 ) + 1) * tc) >> 3
   add      tb, #1
   smulbb   ta, tb, tc
   asr      ta, #3

   subge   sa2, ta                              // Samp2 -= ta if ExamineB is true
   addlt   sa2, ta                              // Samp2 += ta if ExamineB is false
   

/************* Clamp Step Indexes *******************/

   cmp      st1, #89
   cmplt   st2, #89
   bge      .satihi
1:
   cmp      st1, #0
   cmpge   st2, #0
   blt      .satilo

/**************** Clamp Samples *********************/

2:
   cmp      sa1, #32768
   cmplt   sa2, #32768
   bge      .sathi
3:
   cmp      sa1, #-32768
   cmpge   sa2, #-32768
   blt      .satlo

/*************** Write Samples *********************/

4:   
   strh   sa1, [dest], #2
   strh   sa2, [dest], #2
      
   sub      cur, #1
   subs   iter, #1
   bne      .examine_cur
   
   pop      {ta}
   stmia   ta, {cur,da1,st1,sa1,da2,st2,sa2}

   pop      {r4-r11,pc}

.reloadwords:
   mov      cur, #8
   ldr      da1, [src], #4
   ldr      da2, [src], #4
   b      .decode_word_loop

.satihi:
   cmp      st1, #89
   ldrge   st1, [istab, #-68]
   cmp      st2, #89
   ldrge   st2, [istab, #-68]
   b      1b

.satilo:
   cmp      st1, #0
   ldrlt   st1, [istab, #-72]
   cmp      st2, #0
   ldrlt   st2, [istab, #-72]
   b      2b
   
.sathi:
   cmp      sa1, #32768
   ldrge   sa1, [istab, #352]
   cmp      sa2, #32768
   ldrge   sa2, [istab, #352]
   b      3b

.satlo:
   cmp      sa1, #-32768
   ldrlt   sa1, [istab, #356]
   cmp      sa2, #-32768
   ldrlt   sa2, [istab, #356]
   b      4b

.word   0
.word   88
.word   8, 6, 4, 2, -1, -1, -1, -1
.word   8, 6, 4, 2, -1, -1, -1, -1
Index_Step_Table:
.word   7, 8, 9, 10, 11, 12, 13, 14
.word   16, 17, 19, 21, 23, 25, 28, 31
.word   34, 37, 41, 45, 50, 55, 60, 66
.word   73, 80, 88, 97, 107, 118, 130, 143
.word   157, 173, 190, 209, 230, 253, 279, 307
.word   337, 371, 408, 449, 494, 544, 598, 658
.word   724, 796, 876, 963, 1060, 1166, 1282, 1411
.word   1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024
.word   3327, 3660, 4026, 4428, 4871, 5358, 5894, 6484
.word   7132, 7845, 8630, 9493, 10442, 11487, 12635, 13899
.word   15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794
.word   32767
.word   -32768

_________________
DS - It's all about DiscoStew

#167055 - eKid - Sat Feb 28, 2009 12:10 am

Coolness :)

DiscoStew wrote:
Code:
   ldr      ta, [istab, -ta, lsl#2]                  // ta = StepIndex_Table[ -ta ]
   add      st1, ta                              // Step_Index1 += ta

Code:
   smulbb   ta, tb, tc
   asr      ta, #3

LDR and SMULxy (and some others) have an 'interlock' cycle if you use the target register right after the instruction. If you rearrange the instructions below to fit between the load/multiply and the use you can save a cycle here and there.

(example)
Code:
   ldr      ta, [istab, -ta, lsl#2]                  // ta = StepIndex_Table[ -ta ]
   lsl      tb, #1                              // Diff = ((( tb << 1 ) + 1) * tc) >> 3 (avoid interlock)
   add      st1, ta                              // Step_Index1 += ta

#167070 - Miked0801 - Sat Feb 28, 2009 5:32 am

When I see this

Code:

if( Stream_StepIndex > 88 )         Stream_StepIndex = 88;
   if( Stream_StepIndex < 0 )      Stream_StepIndex = 0;
   if( Stream_LastSample > 32767 )      Stream_LastSample = 32767;
   if( Stream_LastSample <= -32768 )   Stream_LastSample = -32768;


it reminds me of the ARM9 saturation math functions (used for clamping). I'm at home so I don't have the exact op-codes in front of me, but they would probably be very useful for those types of operations.

#167072 - Ruben - Sat Feb 28, 2009 6:02 am

I'm not really sure, since I don't code for the DS yet.. but couldn't this be optimized...

Code:
   asr      ta, #3
   subge   sa1, ta                              // Samp1 -= ta if ExamineA is true
   addlt   sa1, ta                              // Samp1 += ta if ExamineA is false

; To this...

   subge   sa1, ta, asr #3
   addlt   sa1, ta, asr #3

#167073 - Dwedit - Sat Feb 28, 2009 6:11 am

Thanks for telling me about the saturating arithmetic instructions on the ARM9, never would have noticed them otherwise.

Looks like the relevant instruction is SSAT or USAT.
SSAT r0, #16, r1 would take R1, force it to be between -32768 and 32767, then store the result in r0.
USAT r0, #31, r1 take r1, force it to be between 0 and 2^31, etc...

I could see USAT as being used by C compilers to turn a bitwise operator into a "logical" operator which needs to return a 1 or zero.
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."

#167074 - DiscoStew - Sat Feb 28, 2009 6:45 am

Documentation I have says that SSAT and USAT are for the ARMv6 and above. But, isn't the ARM9's architecture the ARMv5TE? Guess it doesn't hurt to at least see if it compiles.

EDIT:

Nope, doesn't compile.
_________________
DS - It's all about DiscoStew

#167080 - eKid - Sat Feb 28, 2009 12:29 pm

I sure could use some saturation instructions. :/

#167094 - DiscoStew - Sat Feb 28, 2009 8:28 pm

The only saturation instructions I could see that worked on the ARM9 were QADD and QSUB, both of which saturate to signed 32bit integers, but I gave it a shot anyways, having shifted my initial samples from 16bit to 32bit, and just working with them like that. While I got it working, I didn't really see any change to processing time. Then again, I was still working with tables set at 16bit, requiring me to make a few shifts each loop, so if I convert those up to 32bit, I can remove those shifts, and should get some speed. I'll give that a try.

EDIT:

Got QADD and QSUB working "correctly", but now instead of using smul, I ended up using just mul, which is an extra cycle in similar situations if I'm correct. :(

Maybe I can fix that.
_________________
DS - It's all about DiscoStew

#167103 - DiscoStew - Sun Mar 01, 2009 7:12 am

Well, with all the suggestions given, and with the use of QADD and QSUB, my routines are roughly twice as fast as current c++ counterparts, both mono and stereo. I'm happy with it for the moment.

If those weren't the saturation instructions you were thinking of Miked0801, then I'd like to know about the ones you were talking about. Would be nice not to shift samples and decoding to 32bit just to make use of the current ones.
_________________
DS - It's all about DiscoStew

#167108 - Miked0801 - Sun Mar 01, 2009 4:49 pm

Nope, I believe those were the ones. I'll double check on Monday when I'm back in the office and have my trusty ARM ARM manual in hand.

#175095 - sverx - Wed Sep 08, 2010 3:00 pm

I know this is a quite old topic, but I was curious to know if there were news... it all started because I was searching info on the net about ARM9 and I found this document which -to my weak knowledge- seems related to ARM9.

The fact is that -if I understand correctly what's in the doc- there should be a QADD16 instruction that should be able to perform two saturated addictions on 16 bits operands in the same moment.

Am I wrong? Where I can find good documents about ARM9, then?

Thanks :D

#175096 - Ruben - Wed Sep 08, 2010 3:09 pm

Saturated addictions, aye? ;D

Joking aside:
If I'm reading it correctly, the the halfword extensions to ADD are only available to the ARM6 architecture (IIRC, the ARM9 is of the ARM5E arch), and QADD16 doesn't exist.

EDIT:
And quick note about clamping:
Code:
@ old code
cmp   r0, #0 @ if(r0 < 0)
movlt r0, #0 @ r0 = 0

@ new code
bic r0, r0, r0, asr #31 @ clear everything if top bit [neg] is set


And as for clamping to the sample range [if you have enough registers]:
Code:
@ before loop
ldr ip, =0x7FFF @ positive limit

@ during loop
@ old code
cmp   r0, #-32768
movlt r0, #-32768
cmp   r0, #32767
movgt r0, #32767

@ new code
@ not totally safe, but usually is enough
teq   r0, r0, lsl #16 @ signs differ?
submi r0, ip, r0, asr #31 @ yes - clamp

#175097 - sverx - Wed Sep 08, 2010 3:28 pm

Wow, you were too fast, I just came here to post I realized these instructions comes from version 6...

(is it saturating addictions)?

#175098 - Ruben - Wed Sep 08, 2010 3:53 pm

It's "additions", not "addictions" ;D

#175101 - wintermute - Thu Sep 09, 2010 2:05 am

The DS ARM9 is an ARM946E-S which does have QADD/QSUB instructions. See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.set.arm9/index.html

The Parallel QADD8/16 instructions aren't available though. http://www.keil.com/support/man/docs/armasm/armasm_cjafgdih.htm
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

#175103 - sverx - Thu Sep 09, 2010 2:30 pm

Ruben wrote:
It's "additions", not "addictions" ;D


LOL! Now I see... I wish I had studied English better when I was at school... now it's quite late ;)

#175104 - kusma - Thu Sep 09, 2010 3:54 pm

Since you're using QADD, this means you're doing this for the DS. Why not just use the hardware-support for ADPCM?

#175105 - wintermute - Thu Sep 09, 2010 4:30 pm

You can't use the hardware to stream ADPCM
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

#175106 - kusma - Thu Sep 09, 2010 4:37 pm

wintermute wrote:
You can't use the hardware to stream ADPCM

Why not? How is streaming any different from just breaking things into chunks and playing them back to back?

#175107 - wintermute - Thu Sep 09, 2010 5:23 pm

http://forum.gbadev.org/viewtopic.php?p=165340#165340
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

#175109 - kusma - Thu Sep 09, 2010 6:11 pm

wintermute wrote:
http://forum.gbadev.org/viewtopic.php?p=165340#165340

That's when trying to stream through a ring-buffer through looping. If you pre-chunk your data instead, and play back-to-back instead, it should work just fine.

But in either case, I don't see the point to ASM-optimizing an ADPCM routine. I already did a pure C implementation that used about 3% of CPU time on a GBA (at 18khz mono). On the DS with the ARM9 (which is the target in this case), all that CPU time just diminish.

#175115 - wintermute - Thu Sep 09, 2010 9:55 pm

Pretty sure there were glitches attempting to play adpcm chunks back to back but feel free to prove me wrong :p
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

#175117 - DiscoStew - Fri Sep 10, 2010 1:07 am

One reason why my tests show a higher CPU % is partly because it's taking into account the time for copying chunks from a file in the filesystem into main RAM before any decoding is done. There is also the main RAM glitch when reading/writing data non-sequentially, but since I'm not at my home computer, I can't see if my code is affected by it. It's been a long time since I last touched this project.

I did make a post in the Audio forums about possible ways of having the hardware do the decoding by altering the audio stream prior to encoding in a way to have the initial chunks have the same sample/step.
_________________
DS - It's all about DiscoStew