gbadev.org forum archive

After eKid released his ms-adpcm decoder using assembly for fast stream decoding, I wanted to improve my own ima-adpcm decoding by using assembly as well. With his permission, I used the general template from his decoder, made some changes to some structures, and made the decoding code from scratch in assembly, which took some time to do because of my inexperience with dealing with assembly upfront. It all works, but because of my inexperience, I have no idea if there is any room for improving it. So, in order for me to learn, I've posted the decoding core below, and maybe some of you could teach me the ropes.

The interface mimics eKid's own ms-adpcm decoder, but the input data pointer is for my own structure, also listed below. If there are any question as to why I'm doing X and Y, just ask me.

NOTE: For those currently using my C/C++ implemented ima-adpcm decoder, do not switch to using this, because the structuring is different. When improvements are made (if any can be) and code is organized, I will release it.

Code:

typedef struct t_IMA_Adpcm_Data {

int curSamps;
int data1, step1, samp1;
int data2, step2, samp2;
} IMA_Adpcm_Data;

Code:

.global decode_stereo_ima_adpcm

.text
.arm
.align

#define cur    r0
#define da1    r4
#define st1    r5
#define sa1    r6
#define da2    r7
#define st2    r8
#define sa2    r9

#define ta    r10
#define tb    r11
#define tc    r12

#define istab r14

#define src    r1
#define dest r2
#define iter r3

/*
SampXCode = Stream_CurSamples & 0xF;
Stream_CurSamples >>= 4;
Diff = (((( SampXCode & 0x7 ) << 1 ) + 1 ) * STREAM_StepTab[ Stream_StepIndex ]) >> 3;
Stream_StepIndex += STREAM_IndexTab[ SampXCode ];
Stream_LastSample += ( SampXCode & 0x8 ? -Diff : Diff );

if( Stream_StepIndex > 88 )       Stream_StepIndex = 88;
if( Stream_StepIndex < 0 )    Stream_StepIndex = 0;
if( Stream_LastSample > 32767 )    Stream_LastSample = 32767;
if( Stream_LastSample <= -32768 ) Stream_LastSample = -32768;
*target++ = Stream_LastSample;

*/

/*************************************************************
* *
* decode_stereo_ima_adpcm( data, source, target, iterations ) *
* *
*************************************************************/

decode_stereo_ima_adpcm:

push {r0,r4-r11,lr}
ldmia r0, {cur,da1,st1,sa1,da2,st2,sa2}
adr    istab, Index_Step_Table

.examine_cur:
cmp cur, #0
beq .reloadwords
.decode_word_loop:

/*************** Left channel *********************/

ldr    tc, [istab, st1, lsl#2]                // tc = Step_Index_Table[ Step_Index1 ]

and    ta, da1, #15                      // ta = CurSamples1 & 0xF
lsr    da1, da1, #4                      // CurSamples1 >>= 4
and    tb, ta, #7                         // tb = ta & 0x7
cmp    ta, #8                            // Is ta >= 8 (ExamineA)

add    ta, #1                            // ta += 1
ldr    ta, [istab, -ta, lsl#2]                // ta = StepIndex_Table[ -ta ]
add    st1, ta                            // Step_Index1 += ta

lsl    tb, #1                            // Diff = ((( tb << 1 ) + 1) * tc) >> 3
add    tb, #1
smulbb ta, tb, tc
asr    ta, #3

subge sa1, ta                            // Samp1 -= ta if ExamineA is true
addlt sa1, ta                            // Samp1 += ta if ExamineA is false

/*************** Right channel ********************/

ldr    tc, [istab, st2, lsl#2]                // tc = Step_Index_Table[ Step_Index2 ]

and    ta, da2, #15                      // ta = CurSamples2 & 0xF
lsr    da2, da2, #4                      // CurSamples2 >>= 4
and    tb, ta, #7                         // tb = ta & 0x7
cmp    ta, #8                            // Is ta >= 8 (ExamineB)

add    ta, #1                            // ta += 1
ldr    ta, [istab, -ta, lsl#2]                // ta = StepIndex_Table[ -ta ]
add    st2, ta                            // Step_Index2 += ta

lsl    tb, #1                            // Diff = ((( tb << 1 ) + 1) * tc) >> 3
add    tb, #1
smulbb ta, tb, tc
asr    ta, #3

subge sa2, ta                            // Samp2 -= ta if ExamineB is true
addlt sa2, ta                            // Samp2 += ta if ExamineB is false

/************* Clamp Step Indexes *******************/

cmp    st1, #89
cmplt st2, #89
bge    .satihi
1:
cmp    st1, #0
cmpge st2, #0
blt    .satilo

/**************** Clamp Samples *********************/

2:
cmp    sa1, #32768
cmplt sa2, #32768
bge    .sathi
3:
cmp    sa1, #-32768
cmpge sa2, #-32768
blt    .satlo

/*************** Write Samples *********************/

4:
strh sa1, [dest], #2
strh sa2, [dest], #2

sub    cur, #1
subs iter, #1
bne    .examine_cur

pop    {ta}
stmia ta, {cur,da1,st1,sa1,da2,st2,sa2}

pop    {r4-r11,pc}

.reloadwords:
mov    cur, #8
ldr    da1, [src], #4
ldr    da2, [src], #4
b    .decode_word_loop

.satihi:
cmp    st1, #89
ldrge st1, [istab, #-68]
cmp    st2, #89
ldrge st2, [istab, #-68]
b    1b

.satilo:
cmp    st1, #0
ldrlt st1, [istab, #-72]
cmp    st2, #0
ldrlt st2, [istab, #-72]
b    2b

.sathi:
cmp    sa1, #32768
ldrge sa1, [istab, #352]
cmp    sa2, #32768
ldrge sa2, [istab, #352]
b    3b

.satlo:
cmp    sa1, #-32768
ldrlt sa1, [istab, #356]
cmp    sa2, #-32768
ldrlt sa2, [istab, #356]
b    4b

.word 0
.word 88
.word 8, 6, 4, 2, -1, -1, -1, -1
.word 8, 6, 4, 2, -1, -1, -1, -1
Index_Step_Table:
.word 7, 8, 9, 10, 11, 12, 13, 14
.word 16, 17, 19, 21, 23, 25, 28, 31
.word 34, 37, 41, 45, 50, 55, 60, 66
.word 73, 80, 88, 97, 107, 118, 130, 143
.word 157, 173, 190, 209, 230, 253, 279, 307
.word 337, 371, 408, 449, 494, 544, 598, 658
.word 724, 796, 876, 963, 1060, 1166, 1282, 1411
.word 1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024
.word 3327, 3660, 4026, 4428, 4871, 5358, 5894, 6484
.word 7132, 7845, 8630, 9493, 10442, 11487, 12635, 13899
.word 15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794
.word 32767
.word -32768

_________________
DS - It's all about DiscoStew

Coolness :)

DiscoStew wrote:

Code:

ldr ta, [istab, -ta, lsl#2] // ta = StepIndex_Table[ -ta ]
add st1, ta // Step_Index1 += ta

Code:

smulbb ta, tb, tc
asr ta, #3

LDR and SMULxy (and some others) have an 'interlock' cycle if you use the target register right after the instruction. If you rearrange the instructions below to fit between the load/multiply and the use you can save a cycle here and there.

(example)

Code:

ldr ta, [istab, -ta, lsl#2] // ta = StepIndex_Table[ -ta ]
lsl tb, #1 // Diff = ((( tb << 1 ) + 1) * tc) >> 3 (avoid interlock)
add st1, ta // Step_Index1 += ta

When I see this

Code:

if( Stream_StepIndex > 88 ) Stream_StepIndex = 88;
if( Stream_StepIndex < 0 ) Stream_StepIndex = 0;
if( Stream_LastSample > 32767 ) Stream_LastSample = 32767;
if( Stream_LastSample <= -32768 ) Stream_LastSample = -32768;

it reminds me of the ARM9 saturation math functions (used for clamping). I'm at home so I don't have the exact op-codes in front of me, but they would probably be very useful for those types of operations.

I'm not really sure, since I don't code for the DS yet.. but couldn't this be optimized...

Code:

asr ta, #3
subge sa1, ta // Samp1 -= ta if ExamineA is true
addlt sa1, ta // Samp1 += ta if ExamineA is false

; To this...

subge sa1, ta, asr #3
addlt sa1, ta, asr #3

Thanks for telling me about the saturating arithmetic instructions on the ARM9, never would have noticed them otherwise.

Looks like the relevant instruction is SSAT or USAT.
SSAT r0, #16, r1 would take R1, force it to be between -32768 and 32767, then store the result in r0.
USAT r0, #31, r1 take r1, force it to be between 0 and 2^31, etc...

I could see USAT as being used by C compilers to turn a bitwise operator into a "logical" operator which needs to return a 1 or zero.
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."

Documentation I have says that SSAT and USAT are for the ARMv6 and above. But, isn't the ARM9's architecture the ARMv5TE? Guess it doesn't hurt to at least see if it compiles.

EDIT:

Nope, doesn't compile.
_________________
DS - It's all about DiscoStew

I sure could use some saturation instructions. :/

The only saturation instructions I could see that worked on the ARM9 were QADD and QSUB, both of which saturate to signed 32bit integers, but I gave it a shot anyways, having shifted my initial samples from 16bit to 32bit, and just working with them like that. While I got it working, I didn't really see any change to processing time. Then again, I was still working with tables set at 16bit, requiring me to make a few shifts each loop, so if I convert those up to 32bit, I can remove those shifts, and should get some speed. I'll give that a try.

EDIT:

Got QADD and QSUB working "correctly", but now instead of using smul, I ended up using just mul, which is an extra cycle in similar situations if I'm correct. :(

Maybe I can fix that.
_________________
DS - It's all about DiscoStew

Well, with all the suggestions given, and with the use of QADD and QSUB, my routines are roughly twice as fast as current c++ counterparts, both mono and stereo. I'm happy with it for the moment.

If those weren't the saturation instructions you were thinking of Miked0801, then I'd like to know about the ones you were talking about. Would be nice not to shift samples and decoding to 32bit just to make use of the current ones.
_________________
DS - It's all about DiscoStew

Nope, I believe those were the ones. I'll double check on Monday when I'm back in the office and have my trusty ARM ARM manual in hand.

I know this is a quite old topic, but I was curious to know if there were news... it all started because I was searching info on the net about ARM9 and I found this document which -to my weak knowledge- seems related to ARM9.

The fact is that -if I understand correctly what's in the doc- there should be a QADD16 instruction that should be able to perform two saturated addictions on 16 bits operands in the same moment.

Am I wrong? Where I can find good documents about ARM9, then?

Thanks :D

Saturated addictions, aye? ;D

Joking aside:
If I'm reading it correctly, the the halfword extensions to ADD are only available to the ARM6 architecture (IIRC, the ARM9 is of the ARM5E arch), and QADD16 doesn't exist.

EDIT:
And quick note about clamping:

Code:

@ old code
cmp r0, #0 @ if(r0 < 0)
movlt r0, #0 @ r0 = 0

@ new code
bic r0, r0, r0, asr #31 @ clear everything if top bit [neg] is set

And as for clamping to the sample range [if you have enough registers]:

Code:

@ before loop
ldr ip, =0x7FFF @ positive limit

@ during loop
@ old code
cmp r0, #-32768
movlt r0, #-32768
cmp r0, #32767
movgt r0, #32767

@ new code
@ not totally safe, but usually is enough
teq r0, r0, lsl #16 @ signs differ?
submi r0, ip, r0, asr #31 @ yes - clamp

Wow, you were too fast, I just came here to post I realized these instructions comes from version 6...

(is it saturating addictions)?

It's "additions", not "addictions" ;D

The DS ARM9 is an ARM946E-S which does have QADD/QSUB instructions. See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.set.arm9/index.html

The Parallel QADD8/16 instructions aren't available though. http://www.keil.com/support/man/docs/armasm/armasm_cjafgdih.htm
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

Ruben wrote:

It's "additions", not "addictions" ;D

LOL! Now I see... I wish I had studied English better when I was at school... now it's quite late ;)

Since you're using QADD, this means you're doing this for the DS. Why not just use the hardware-support for ADPCM?

You can't use the hardware to stream ADPCM
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

wintermute wrote:

You can't use the hardware to stream ADPCM

Why not? How is streaming any different from just breaking things into chunks and playing them back to back?

http://forum.gbadev.org/viewtopic.php?p=165340#165340
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

wintermute wrote:

http://forum.gbadev.org/viewtopic.php?p=165340#165340

That's when trying to stream through a ring-buffer through looping. If you pre-chunk your data instead, and play back-to-back instead, it should work just fine.

But in either case, I don't see the point to ASM-optimizing an ADPCM routine. I already did a pure C implementation that used about 3% of CPU time on a GBA (at 18khz mono). On the DS with the ARM9 (which is the target in this case), all that CPU time just diminish.

Pretty sure there were glitches attempting to play adpcm chunks back to back but feel free to prove me wrong :p
_________________
devkitPro - professional toolchains at amateur prices
devkitPro IRC support
Personal Blog

One reason why my tests show a higher CPU % is partly because it's taking into account the time for copying chunks from a file in the filesystem into main RAM before any decoding is done. There is also the main RAM glitch when reading/writing data non-sequentially, but since I'm not at my home computer, I can't see if my code is affected by it. It's been a long time since I last touched this project.

I did make a post in the Audio forums about possible ways of having the hardware do the decoding by altering the audio stream prior to encoding in a way to have the initial chunks have the same sample/step.
_________________
DS - It's all about DiscoStew

gbadev.org forum archive

ASM > Looking for some optimization tips (ima-adpcm)

#167050 - DiscoStew - Fri Feb 27, 2009 8:23 pm

#167055 - eKid - Sat Feb 28, 2009 12:10 am

#167070 - Miked0801 - Sat Feb 28, 2009 5:32 am

#167072 - Ruben - Sat Feb 28, 2009 6:02 am

#167073 - Dwedit - Sat Feb 28, 2009 6:11 am

#167074 - DiscoStew - Sat Feb 28, 2009 6:45 am

#167080 - eKid - Sat Feb 28, 2009 12:29 pm

#167094 - DiscoStew - Sat Feb 28, 2009 8:28 pm

#167103 - DiscoStew - Sun Mar 01, 2009 7:12 am

#167108 - Miked0801 - Sun Mar 01, 2009 4:49 pm

#175095 - sverx - Wed Sep 08, 2010 3:00 pm

#175096 - Ruben - Wed Sep 08, 2010 3:09 pm

#175097 - sverx - Wed Sep 08, 2010 3:28 pm

#175098 - Ruben - Wed Sep 08, 2010 3:53 pm

#175101 - wintermute - Thu Sep 09, 2010 2:05 am

#175103 - sverx - Thu Sep 09, 2010 2:30 pm

#175104 - kusma - Thu Sep 09, 2010 3:54 pm

#175105 - wintermute - Thu Sep 09, 2010 4:30 pm

#175106 - kusma - Thu Sep 09, 2010 4:37 pm

#175107 - wintermute - Thu Sep 09, 2010 5:23 pm

#175109 - kusma - Thu Sep 09, 2010 6:11 pm

#175115 - wintermute - Thu Sep 09, 2010 9:55 pm

#175117 - DiscoStew - Fri Sep 10, 2010 1:07 am