gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > AdLib emulation optimization help

#170293 - Pate - Wed Sep 16, 2009 6:17 am

Hi!

I am working on an AdLib emulation running on DS ARM7. It currently can manage 8 channels (each with 2 operators), but the ninth (and final) channel makes it take more CPU cycles than are available. I was hoping you gurus could perhaps take a look at my inner loops and check if I have missed some obvious optimization possibilities.

I am planning to change the rows of ldr and str commands to ldmia and stmia after I reorder the slot struct variables and register usage, but that is only done 9 times per frame while the inner loops are done 256*9 times a frame so any optimization in the for_SLOT1 and for_SLOT2 loops would be much appreciated!

Thanks in advance!

Code:

.for_channel:
    @-------
    @ Per-channel entry point: processes both operators ("slots") of one channel.
    @ The register state expected here is set up before this excerpt (not shown).
    @ As used by the code below:
    @   r0  = base address of the current channel's SLOT1 data
    @   lr  = pointer into the halfword sample buffer (post-incremented by 2 per sample)
    @   r12 = packed state word:
    @           bits 0..7   LFO_AM value plus flag bits (bit 0 = lfo_am_depth, bit 2 = rhythm mode)
    @           bits 8..15  bit flags of the slot currently being processed
    @           bits 16..19 channel number
    @           bits 20..   remaining-sample counter for the inner loops
    @-------
    @-------
    @ Load the SLOT-specific data values
    @-------
    ldrb   r1, [r0, #(ch0_slot1_bits-SLOT1)]        @ r1 = SLOT1 bits (Feedback(3 bits), Con, AM, Vib, EG type, KSR)
    ldr    r2, [r0, #(ch0_slot1_wavetable-SLOT1)]   @ r2 = SLOT1 wavetable value
    ldr    r3, [r0, #(ch0_slot1_op1_out-SLOT1)]     @ r3 = SLOT1 op1_out value
    ldr    r4, [r0, #(ch0_slot1_env_sustain-SLOT1)] @ r4 = SLOT1 sustain level (MAX_ATT_INDEX if release phase)
    ldr    r5, [r0, #(ch0_slot1_envelope-SLOT1)]    @ r5 = SLOT1 envelope counter value
    ldr    r6, [r0, #(ch0_slot1_Incr-SLOT1)]        @ r6 = SLOT1 Incr value
    ldr    r7, [r0, #(ch0_slot1_Cnt-SLOT1)]         @ r7 = SLOT1 Cnt value
    ldr    r8, [r0, #(ch0_slot1_volume-SLOT1)]      @ r8 = SLOT1 volume value << 16
    ldr    r9, [r0, #(ch0_slot1_TLL-SLOT1)]         @ r9 = SLOT1 TLL value
    ldr    r10, =sin_tab
    ldr    r11, =tl_tab                             @ r11 = tl_tab base; stays loaded for both slot loops
    add    r10, r2                                  @ r10 = sin_tab + SLOT1->wavetable
    @-------
    @    tmp = lfo_am_table[ OPL->lfo_am_cnt >> LFO_SH ];
    @    if (OPL->lfo_am_depth)
    @        LFO_AM = tmp;
    @    else
    @        LFO_AM = tmp>>2;
    @-------

    bic    r12, #0xFF00                             @ Drop the previous slot's bits from r12 second byte
    orr    r12, r1, lsl #8                          @ Put all bit values of SLOT1 into r12 second byte
    and    r1, r12, #0xFF                           @ r1 = LFO_AM and lfo_am_depth
   
    @ r2 is free again here (the wavetable value was folded into r10 above).
    @ r2 = shift count: 3 when the depth flag is set, 5 (i.e. a further >>2) when it is clear.
    @ The shift also discards the three low bits of the byte, so LFO_AM itself
    @ presumably sits in bits 3..7 of r12 -- TODO confirm against the code that builds r12.
    mov    r2,  #3
    tst    r1,  #1                                  @ if (OPL->lfo_am_depth)
    moveq  r2,  #5
    tst    r12, #(8<<8)                             @ AM bit set?
    addne  r9,  r1, lsr r2                          @ Now r9 = SLOT1 volume base (0..511?) = TLL (+ LFO_AM when AM is set)
    orr    r12, #((ADLIB_BUFFER_SAMPLES-1)<<20)     @ Seed the sample counter in the top bits of r12
.for_SLOT1:                                         @ for( i=length-1; i >= 0 ; i-- ) {
    @-------
    @ Calculate the FM part for SLOT 1
    @ Output goes always to the output buffer, SLOT2 uses it as phase_modulation or direct output depending on connect
    @
    @    FREQ_SH = 16
    @    FREQ_MASK = 65535
    @    SIN_MASK = 1023
    @    ENV_QUIET = 384 (= 6144>>4)
    @
    @    out  = SLOT->op1_out[0] + SLOT->op1_out[1];
    @    SLOT->op1_out[0] = SLOT->op1_out[1];
    @    SLOT->op1_out[1] = 0;
    @    *SLOT->connect1 += SLOT->op1_out[0];
    @    env  = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
    @    if( env < ENV_QUIET )
    @    {
    @        if (!SLOT->FB)
    @            out = 0;
    @        UINT32 p = (env<<4) + sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB))) >> FREQ_SH ) & SIN_MASK) ];
    @        if (p < TL_TAB_LEN)
    @            SLOT->op1_out[1] = tl_tab[p];
    @    }
    @
    @ The C above is the reference being ported; the assembly below deviates from it
    @ in places, noted on the individual lines.
    @ r3 packs two samples: the newest output in the low halfword and the one
    @ before it in the high halfword (see the lsl/orrlo pair below).
    @-------
    strh   r3, [lr], #2                             @ Store low halfword of r3 to the buffer, lr += 2 (a plain store, not the "+=" of the C reference)
    add    r2, r9, r8, lsr #16                      @ r2 = env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
    tst    r12, #(7<<(8+5))                         @ Feedback = 0?
    moveq  r1, r7, lsr #16                          @ Yes, use only SLOT->Cnt
    beq    .op1_no_feedback
        @-------
        @ Feedback active, calculate r1 = ((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB)))
        @-------
        mov    r1, r12, lsr #(8+5)                  @ r1 = feedback value, 1..7, 7 = smallest feedback, 1 = largest
        and    r1, #7
        add    r1, r7, r3, asr r1                   @ r1 = SLOT1->Cnt + (r3 >> feedback): whole packed r3 word, arithmetic shift RIGHT (C reference shows out<<FB)
        lsr    r1, #16                              @ r1 >>= FREQ_SH
.op1_no_feedback:
    bic    r1, #0xFC00                              @ r1 &= SIN_MASK (r1 is 16 bits wide here; clearing bits 10..15 leaves 0..1023)
    lsl    r3, #16                                  @ r3 = SLOT->op1_out[0] = SLOT->op1_out[1]; SLOT->op1_out[1] = 0;
    ldr    r1, [r10, r1, lsl #2]                    @ r1 = sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB))) >> FREQ_SH ) & SIN_MASK) ];
    add    r1, r2, lsl #5                           @ r1 = env<<4 + sin_tab[..], extra << 1 for halfword accessing
    cmp    r1, #(2*TL_TAB_LEN)                      @ if (p < TL_TAB_LEN)
        ldrloh r1, [r11, r1]
        orrlo  r3, r1                               @ SLOT->op1_out[1] = tl_tab[p];
    @-------
    @ Calculate envelope for SLOT 1
    @ op1_attack and op1_sustain are outside this excerpt; presumably both
    @ return to op1_env_done -- confirm.
    @-------
    tst    r5, #1                                   @ Are we in ATTACK phase? (bit 0 of the envelope value is the flag)
    bne    op1_attack                               @ Yes, go handle ATTACK phase volume envelope
    add    r8, r5, lsr #1                           @ Decrease the volume by the envelope counter (r8 += r5>>1; r8 presumably holds attenuation, so adding lowers loudness)
    cmp    r8, r4                                   @ Did we go under the SUSTAIN level?
    bhi    op1_sustain                              @ Yep, go adjust the volume (taken when r8 > r4, unsigned)
op1_env_done:   
    @-------
    @ Calculate phase generator values for SLOT 1
    @
    @    /* Phase Generator */
    @    if(op->vib)
    @    {
    @        unsigned int block_fnum = CH->block_fnum;
    @        unsigned int fnum_lfo   = (block_fnum&0x0380) >> 7;
    @        signed int lfo_fn_table_index_offset = lfo_pm_table[LFO_PM + 16*fnum_lfo ];
    @        if (lfo_fn_table_index_offset)    /* LFO phase modulation active */
    @        {
    @            block_fnum += lfo_fn_table_index_offset;
    @            UINT8 block = (block_fnum&0x1c00) >> 10;
    @            op->Cnt += (OPL->fn_tab[block_fnum&0x03ff] >> (7-block)) * op->mul;
    @        }
    @        else    /* LFO phase modulation  = zero */
    @            op->Cnt += op->Incr;
    @    }
    @    else    /* LFO phase modulation disabled for this operator */
    @        op->Cnt += op->Incr;
    @
    @ Only the plain "Cnt += Incr" case of the reference is implemented below.
    @-------
    add    r7, r6                                   @ SLOT1->Cnt += SLOT1->Incr
    @-------
    @ Loop to next sample
    @-------
    subs   r12, #(1<<20)                            @ Decrement the sample counter held in r12 bits 20..
    bpl    .for_SLOT1                               @ }
    add    r12, #(1<<20)                            @ Undo the final decrement so the counter bits of r12 read zero again
    @-------
    @ Save the final values for SLOT1
    @-------
    str    r3, [r0, #(ch0_slot1_op1_out-SLOT1)]     @ r3 = SLOT1 op1_out value
    str    r4, [r0, #(ch0_slot1_env_sustain-SLOT1)] @ r4 = SLOT1 sustain level (MAX_ATT_INDEX if release phase)
    str    r5, [r0, #(ch0_slot1_envelope-SLOT1)]    @ r5 = SLOT1 envelope counter & mode value
    str    r7, [r0, #(ch0_slot1_Cnt-SLOT1)]         @ r7 = SLOT1 Cnt value
    str    r8, [r0, #(ch0_slot1_volume-SLOT1)]      @ r8 = SLOT1 volume value << 16
   
    @-------
    @ Calculate all the 256 samples for SLOT 2
    @ r0 is advanced by SLOT_SIZE, so the same (ch0_slot1_xxx-SLOT1) offsets
    @ used below now address the SLOT2 fields.
    @-------
    add    r0, #SLOT_SIZE
    sub    lr, #ADLIB_BUFFER_SIZE                   @ Rewind buffer pointer back to start
   
    ldrb   r1, [r0, #(ch0_slot1_bits-SLOT1)]        @ r1 = SLOT2 bits (Feedback, Con, AM, Vib, EG type, KSR)
    ldr    r2, [r0, #(ch0_slot1_wavetable-SLOT1)]   @ r2 = SLOT2 wavetable value
    ldr    r4, [r0, #(ch0_slot1_env_sustain-SLOT1)] @ r4 = SLOT2 sustain level (MAX_ATT_INDEX if release phase)
    ldr    r5, [r0, #(ch0_slot1_envelope-SLOT1)]    @ r5 = SLOT2 envelope value
    ldr    r6, [r0, #(ch0_slot1_Incr-SLOT1)]        @ r6 = SLOT2 Incr value
    ldr    r7, [r0, #(ch0_slot1_Cnt-SLOT1)]         @ r7 = SLOT2 Cnt value
    ldr    r8, [r0, #(ch0_slot1_volume-SLOT1)]      @ r8 = SLOT2 volume value << 16
    ldr    r9, [r0, #(ch0_slot1_TLL-SLOT1)]         @ r9 = SLOT2 TLL value
    ldr    r10, =sin_tab                            @ r11 (tl_tab) is not reloaded; it still holds the value from the SLOT1 setup
   
    and    r3, r12, #0xFF                           @ r3 = LFO_AM and lfo_am_depth
    bic    r12, #0xFF00                             @ Drop the SLOT1 bits from r12 second byte
   
    add    r10, r2                                  @ r10 = sin_tab + SLOT2->wavetable

    @-------
    @    tmp = lfo_am_table[ OPL->lfo_am_cnt >> LFO_SH ];
    @    if (OPL->lfo_am_depth)
    @        LFO_AM = tmp;
    @    else
    @        LFO_AM = tmp>>2;
    @-------
    mov    r2,  #3                                  @ r2 = shift count: 3 with depth flag set, 5 (a further >>2) without
    tst    r3,  #1                                  @ if (OPL->lfo_am_depth)
    moveq  r2,  #5
    tst    r1,  #8                                  @ AM bit set?
    addne  r9,  r3, lsr r2                          @ Now r9 = SLOT2 volume base (0..511?)
   
    orr    r12, r1, lsl #8                          @ Put all bit values of SLOT2 into r12 second byte
   
    orr    r12, #((ADLIB_BUFFER_SAMPLES-1)<<20)     @ Seed the sample counter in the top bits of r12
.for_SLOT2:                                         @ for( i=length-1; i >= 0 ; i-- ) {
    @-------
    @ Calculate the FM part for SLOT 2
    @
    @     env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
    @    if( env < ENV_QUIET )
    @    {
    @        UINT32 p = (env<<4) + sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (phase_modulation<<16))) >> FREQ_SH ) & SIN_MASK) ];
    @        if (p >= TL_TAB_LEN)
    @            output[0] += 0;
    @        else
    @            output[0] += tl_tab[p];
    @    }
    @
    @ Each iteration reads the halfword the SLOT1 loop stored at [lr] and
    @ overwrites that same location with the channel output.
    @-------
    ldrh   r3, [lr]                                 @ r3 = either phase_modulation or output[0]
    add    r2, r9, r8, lsr #16                      @ r2 = env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
    tst    r12, #(1<<(8+4))                         @ If Con=1, op1 produces sound directly, else use it as phase modulation
    addeq  r1, r7, r3, lsl #16                      @ r1 = ((SLOT->Cnt) + (phase_modulation<<16))
    moveq  r3, #0                                   @ Con=0: op1 was only a modulator, so the output sum starts from zero
    movne  r1, r7                                   @ Con=1: unmodulated phase; r3 keeps the op1 sample as the start of the sum
    lsr    r1, #16                                  @ r1 >>= FREQ_SH
    bic    r1, #0xFC00                              @ r1 &= SIN_MASK
    ldr    r1, [r10, r1, lsl #2]                    @ r1 = sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (phase_modulation<<16))) >> FREQ_SH ) & SIN_MASK) ];
    add    r1, r2, lsl #5                           @ r1 = env<<4 + sin_tab[..], extra << 1 for halfword accessing
    cmp    r1, #(2*TL_TAB_LEN)                      @ if (p < TL_TAB_LEN)
        ldrloh r1, [r11, r1]
        addlo  r3, r1                               @    output[0] += tl_tab[p];

    @-------
    @ Store the sample to output buffer
    @
    @    lt = output[0];
    @    lt >>= FINAL_SH;
    @    /* limit check */
    @    lt = limit( lt , MAXOUT, MINOUT );
    @    /* store to sound buffer */
    @    buf[i] = lt;
    @
    @ The FINAL_SH shift and the limit check of the reference are not performed
    @ here; the low halfword of r3 is stored as-is.
    @-------
    strh   r3, [lr], #2                             @ buf[i] = output[0];

    @-------
    @ Calculate envelope for SLOT 2
    @ op2_attack and op2_sustain are outside this excerpt; presumably both
    @ return to op2_env_done -- confirm.
    @-------
    tst    r5, #1                                   @ Are we in ATTACK phase?
    bne    op2_attack                               @ Yes, go handle ATTACK phase volume envelope
    add    r8, r5, lsr #1                           @ Decrease the volume by the envelope counter (r8 += r5>>1)
    cmp    r8, r4                                   @ Did we go under the SUSTAIN level?
    bhi    op2_sustain                              @ Yep, go adjust the volume (taken when r8 > r4, unsigned)
op2_env_done:   
    @-------
    @ Calculate phase generator values for SLOT 2
    @
    @    /* Phase Generator */
    @    if(op->vib)
    @    {
    @        unsigned int block_fnum = CH->block_fnum;
    @        unsigned int fnum_lfo   = (block_fnum&0x0380) >> 7;
    @        signed int lfo_fn_table_index_offset = lfo_pm_table[LFO_PM + 16*fnum_lfo ];
    @        if (lfo_fn_table_index_offset)    /* LFO phase modulation active */
    @        {
    @            block_fnum += lfo_fn_table_index_offset;
    @            UINT8 block = (block_fnum&0x1c00) >> 10;
    @            op->Cnt += (OPL->fn_tab[block_fnum&0x03ff] >> (7-block)) * op->mul;
    @        }
    @        else    /* LFO phase modulation  = zero */
    @            op->Cnt += op->Incr;
    @    }
    @    else    /* LFO phase modulation disabled for this operator */
    @        op->Cnt += op->Incr;
    @
    @ Only the plain "Cnt += Incr" case of the reference is implemented below.
    @-------
    add    r7, r6                                   @ SLOT->Cnt += SLOT->Incr
    @-------
    @ Loop to next sample
    @-------
    subs   r12, #(1<<20)                            @ Decrement the sample counter held in r12 bits 20..
    bpl    .for_SLOT2                               @ }
    add    r12, #(1<<20)                            @ Undo the final decrement so the counter bits of r12 read zero again
    @-------
    @ Save the final values for SLOT2
    @ (r0 still points at SLOT2 here, so the slot1-relative offsets address SLOT2 fields)
    @-------
    str    r4, [r0, #(ch0_slot1_env_sustain-SLOT1)] @ r4 = SLOT2 sustain level (MAX_ATT_INDEX if release phase)
    str    r5, [r0, #(ch0_slot1_envelope-SLOT1)]    @ r5 = SLOT2 envelope value
    str    r7, [r0, #(ch0_slot1_Cnt-SLOT1)]         @ r7 = SLOT2 Cnt value
    str    r8, [r0, #(ch0_slot1_volume-SLOT1)]      @ r8 = SLOT2 volume value << 16

    @-------
    @ Go handle the next channel unless this was already the last channel.
    @
    @ NOTE(review): the channel number is incremented before the compare and the
    @ loop continues while it is below the limit, so a limit of 0x00080000 gives
    @ 8 passes (6 for 0x00060000) if the counter starts at 0 -- the start value is
    @ set outside this excerpt. Handling a ninth channel would need 0x00090000.
    @-------
    add    r0, #SLOT_SIZE                           @ r0 = SLOT1 of the next channel
    add    lr, #(ADLIB_BUFFER_SIZE)                 @ lr already advanced through this channel's samples in the SLOT2 loop; this skips one more ADLIB_BUFFER_SIZE (presumably the other half of a double buffer -- confirm)
    add    r12, #0x00010000                         @ channel++
    and    r4, r12, #0x000F0000                     @ r4 = channel number field (r4/r5 are free: SLOT2 values were stored above)
    tst    r12, #4                                  @ Do we have rhythm mode on?
    moveq  r5, #0x00080000                          @ Nope, handle 9 melodic channels (limit value is 8 -- see note above)
    movne  r5, #0x00060000                          @ Yep, handle 6 melodic channels
    cmp    r4, r5
    blt    .for_channel


Pate
_________________

#170295 - Ruben - Wed Sep 16, 2009 6:54 am

I'm not sure if I'm reading this correctly (as I don't understand what you mean by 'slot') but.. are you calculating the envelopes *during* mixing? If so, that will bloat your code a lot: calculate the final scaling value *before* entering the mix loop.

Also, I see you using a lot of "tst/cmp" inside the mixing loop. If the values in the registers are constant, then I would suggest making separate loops for each condition, to avoid testing during the mixing.

Also, conditionals are t3h r0ckz0rrz ^_^'
Code:
@ Old code
.for_SLOT1:
    strh   r3, [lr], #2
    add    r2, r9, r8, lsr #16
    tst    r12, #(7<<(8+5))
    moveq  r1, r7, lsr #16
    beq    .op1_no_feedback
        mov    r1, r12, lsr #(8+5)
        and    r1, #7
        add    r1, r7, r3, asr r1
        lsr    r1, #16
.op1_no_feedback:

@ New code

@ Branch-free variant: the flags set by tst select one of the two paths,
@ and none of the conditional instructions below modify the flags.
.for_SLOT1:
    strh   r3, [lr], #2
    add    r2, r9, r8, lsr #16
    tst    r12, #(7<<(8+5))                         @ Z set when the three feedback bits are all zero
    moveq  r1, r7, lsr #16                          @ No feedback: r1 = Cnt >> 16
        movne  r1, r12, lsr #(8+5)                  @ Feedback: r1 = feedback field ...
        andne  r1, #7                               @ ... masked to 3 bits
        addne  r1, r7, r3, asr r1                   @ r1 = Cnt + (r3 >> feedback)
        lsrne  r1, #16                              @ r1 >>= 16

#170296 - Pate - Wed Sep 16, 2009 7:09 am

Ruben wrote:
I'm not sure if I'm reading this correctly (as I don't understand what you mean by 'slot') but.. are you calculating the envelopes *during* mixing?


Well, the envelope can change during mixing (like going from attack to decay and then to sustain can all happen within 256 samples), but I see what you mean. I could probably handle each stage separately, if needed.

Quote:
Also, I see you using a lot of "tst/cmp" inside the mixing loop. If the values in the registers are constant, then I would suggest making separate loops for each condition, to avoid testing during the mixing.


Ah, of course! Why didn't I think of that.. I most likely need to test the volume for each sample, but the feedback level and connection mode will stay constant, so having 4 different loops (one for each case) will most likely shave off a lot of cycles! Many many thanks for this tip!

Quote:

Also, conditionals are t3h r0ckz0rrz ^_^'


Really? I mean, even 4 commands is faster to do using conditional execution rather than a branch? I thought the limit was somewhere around 2-3.. What is the number for switching to a branch instead?

Much thanks again! I feel confident that even that single change will make my code handle all 9 channels without problems. Looking forward to getting home from work so I can start coding it! :-)

Pate
_________________

#170297 - Ruben - Wed Sep 16, 2009 7:13 am

Well, a branch is 3 cycles. And you've got "tst, moveq, beq" followed by 4 instructions of the opposite condition. If the condition was 0, then it would take 4 cycles + 1 for the tst, and 6 cycles + 1 for the tst in the other case. Without the branch it would be 5 cycles + 1 for the tst, which is in between, so it's a nice eq/ne trade-off.

EDIT:

On further inspection:

After the tst, you've got 4 cycles if it was 0. If it was not 0, then you've got 5 cycles. In short, you can keep the beq to make it faster if it was 0, or keep the conditionals to 'level out' the speed.

EDIT 2:

On further inspection again...

After the test, if the condition was 0, it would take 4 cycles. If it wasn't 0, then you've got *six* cycles, so yes, I would say to get rid of the branch to have a trade-off.


Last edited by Ruben on Thu Sep 17, 2009 7:14 am; edited 1 time in total

#170305 - Miked0801 - Wed Sep 16, 2009 2:46 pm

I see quite a few mov / tst / lsr type ops. These almost always can be incorporated into arithmetic ops and removed. Also, you are working with halfwords - any way to up that to 32 bits and better take advantage of your CPU (and probably bus size)?

#170312 - Pate - Wed Sep 16, 2009 3:57 pm

Miked0801 wrote:
I see quite a few mov / tst / lsr type ops. These almost always can be incorporated into arithmetic ops and removed.

Hmm.. Can you give an example? I'm just learning ARM ASM, so all tricks are appreciated!

Quote:
Also, you are working with halfwords - any way to up that to 32 bits and better take advantage of your CPU (and probably bus size)?


Like doing 2 samples at a time? That might be worth trying. I tried to fit everything I need in the inner loops into registers, but I would run out of registers when trying to do two samples at the same time.

Btw, I managed now to have all 9 channels running, after doing the separate loops as suggested by Ruben, and unrolling the inner loops once (so I only test for buffer end once every two samples). I could probably also safely skip the envelope checks and new volume calculation for every second sample.

Anyways, it is starting to look like this might actually work, thanks again for your help!

Pate
_________________

#170314 - Ruben - Wed Sep 16, 2009 4:14 pm

Quote:
Hmm.. Can you give an example? I'm just learning ARM ASM, so all tricks are appreciated!

Most of the stuff on there is pretty well optimized in this sense, but he means something like
Code:
@ Eep, slow
mov r0, r1, asr #5
add r3, r0, r3

@ Yay, one opcode faster ^_^'
add r3, r3, r1, asr #5

Quote:
Like doing 2 samples at a time?

Probably. Depends on if you're using stereo or not. If you're not using stereo, then yes, 2 samples at once, thereby avoiding using 2 costly stores and replace it with 1.
Quote:
I could probably also safely skip the envelope checks and new volume calculation for every second sample.
You probably could and if you can, you should: if you have played any Japan-originating GBA games that use the 'Sappy' engine, pay attention to the envelopes: these are only updated once per *frame* and it's barely noticeable, so I think you can 'afford' to do this outside of the mixing loop.

#170335 - Pate - Thu Sep 17, 2009 5:10 am

Ruben wrote:
Quote:
Like doing 2 samples at a time?

Probably. Depends on if you're using stereo or not. If you're not using stereo, then yes, 2 samples at once, thereby avoiding using 2 costly stores and replace it with 1.


Yeah, I'm using mono (stereo in the AdLib/SoundBlaster world means having 2 OPL2 chips, each with their own 9 channels/18 operators, so that is pretty much out of reach with ARM7). And, I am already keeping two samples for slot1 in r3 register, where the high halfword is the previous sample and low halfword is the current sample. I'll just need to swap the meaning of those and store the full word, should be a straightforward change. I'll see if something similar could be done with slot2.

Quote:
You probably could and if you can, you should: if you have played any Japan-originating GBA games that use the 'Sappy' engine, pay attention to the envelopes: these are only updated once per *frame* and it's barely noticeable, so I think you can 'afford' to do this outside of the mixing loop.


Yeah, I'll have to see what effect that has. In the original code from DosBox that I am using as an example, the attack envelope does a table lookup and a MULTIPLY operation for each sample!

Edit: Actually, the DosBox emulator does a table lookup and multiply at 50000Hz per slot, so at 16384Hz I should do that 3 times per sample to make my code work exactly like the DosBox one. I don't think I will, though. :-)

Thanks for your tips again, it is very useful to hear other people's optimization ideas, you become so blind to problems in your own code.

Pate
_________________

#170352 - Pate - Fri Sep 18, 2009 4:57 am

I worked on the code again a bit yesterday, looks like handling two samples at a time is pretty easy for both slots. Thanks again for the idea!

When I want to replace the low halfword of register r3 with the value in r1, is this the optimal code?
Code:

lsr r3, #16
orr r3, r1, r3, lsl #16


How about when I need to add a halfword to the low halfword, without affecting the high halfword. Is this the best that can be done?
Code:

ror r3, #16
add r3, r1, lsl #16
ror r3, #16


The latter does not look very efficient, but I couldn't figure out a better way to do it...

Thanks!

Pate
_________________

#170354 - Ruben - Fri Sep 18, 2009 4:03 pm

Code:
lsr r3, #16
orr r3, r1, r3, lsl #16

Yes, I think that's the best you can get it to.

Code:
ror r3, #16
add r3, r1, lsl #16
ror r3, #16

Let's look at that again...
Code:
ror r3, #16         @ DDDDxxxx
add r3, r1, lsl #16 @ DDDD += r1
ror r3, #16         @ xxxxDDDD

@ Basically, you want to add a value into
@ DDDD, without affecting the upper hword.
@ If you're going to be doing this twice,
@ you can do this...

@ Each step moves the addend into the top halfword and rotates r3 by 16 as
@ part of the add, so the halfword being updated is always the upper one:
@ a carry out of it is discarded and cannot spill into the other halfword.
@ The two halfwords of r3 swap places on every step, so consecutive steps
@ update alternating halfwords. r1 is overwritten (shifted) by each step.
@ (The shift is written as "lsl r1, #16"; "mov r1, lsl #16" has no source
@ register and is not a valid operand form.)

@ Sample 1: r3 swapped to DDDDxxxx
lsl r1, #16
add r3, r1, r3, ror #16

@ Sample 2: r3 swapped back to xxxxDDDD
lsl r1, #16
add r3, r1, r3, ror #16

#170392 - Pate - Mon Sep 21, 2009 4:58 am

Okay, I guess it's time for an update...

First off, I only noticed last weekend that all the problems I was having were caused by my playing the SAME buffer that I wrote into, not the other buffer! Argh.. So much for that being the easy part. :-)

What threw me off was that for some peculiar reason No$GBA sounds better when writing to the wrong buffer! The real hardware warbled horribly while No$GBA sounded reasonably clean with only a few clicks now and then (which I then assumed was caused by the CPU lagging behind the buffer fill). Last weekend I finally noticed that something was badly wrong when even only 2 channels caused similar problems.

I then finally after many hours of head scratching found that I had the buffers wrong, and when I switched those, real hardware suddenly sounded completely clean, while No$GBA began warbling. Strange...

Anyways, I then added some checks for CPU load and noticed that at 16kHz my code only took 20% of ARM7 power to handle all 9 channels. So, I immediately upped the mixing speed to 32kHz, and now the code takes around 40% CPU, which I think is fine as I want the ARM7 to do other things besides the AdLib emulation as well.

I still have various problems in the code that I need to fix, but looks like I have a reasonable speed margin now to add the missing features.

Thanks again for all your tips!

Pate
_________________