#170293 - Pate - Wed Sep 16, 2009 6:17 am
Hi!
I am working on an AdLib emulation running on DS ARM7. It currently can manage 8 channels (each with 2 operators), but the ninth (and final) channel makes it take more CPU cycles than are available. I was hoping you gurus could perhaps take a look at my inner loops and check if I have missed some obvious optimization possibilities.
I am planning to replace the runs of individual ldr and str instructions with ldmia and stmia once I have reordered the slot struct variables and the register usage, but that code only runs 9 times per frame, while the inner loops run 256*9 times per frame, so any optimization in the for_SLOT1 and for_SLOT2 loops would be much appreciated!
Thanks in advance!
Pate
_________________
I am working on an AdLib emulation running on DS ARM7. It currently can manage 8 channels (each with 2 operators), but the ninth (and final) channel makes it take more CPU cycles than are available. I was hoping you gurus could perhaps take a look at my inner loops and check if I have missed some obvious optimization possibilities.
I am planning to replace the runs of individual ldr and str instructions with ldmia and stmia once I have reordered the slot struct variables and the register usage, but that code only runs 9 times per frame, while the inner loops run 256*9 times per frame, so any optimization in the for_SLOT1 and for_SLOT2 loops would be much appreciated!
Thanks in advance!
Code: |
@=======================================================================
@ .for_channel - render one buffer of samples for each melodic channel.
@
@ Each pass handles one channel: first all samples of SLOT 1 (the
@ .for_SLOT1 loop), then all samples of SLOT 2 (the .for_SLOT2 loop),
@ which reads back what SLOT 1 stored in the same buffer.
@
@ Register use inside this block:
@   r0  = pointer to the current slot's data; advanced by SLOT_SIZE after
@         each slot, so the same ch0_slot1_* offsets address both slots
@   r1  = scratch (phase / table index / table value)
@   r2  = scratch (LFO shift count during setup, env inside the loops)
@   r3  = SLOT 1: packed op1_out (newest sample in the low halfword,
@         previous sample in the high halfword)
@         SLOT 2: halfword read from the buffer / output accumulator
@   r4  = sustain level        r5  = envelope value (bit 0 = attack flag)
@   r6  = Incr                 r7  = Cnt (phase counter)
@   r8  = volume << 16         r9  = TLL (+ LFO AM when the AM bit is set)
@   r10 = sin_tab + wavetable  r11 = tl_tab
@   r12 = packed state:
@           bits  0..7   LFO_AM value and flags (bit 0 = lfo_am_depth,
@                        bit 2 = rhythm mode, value in the upper bits)
@           bits  8..15  bits of the current slot
@                        (Feedback(3 bits), Con, AM, Vib, EG type, KSR)
@           bits 16..19  channel counter
@           bits 20..27  sample loop counter (counts down)
@   lr  = output buffer pointer
@
@ r0, r12 and lr are set up by code that is not part of this listing.
@ op1_attack, op1_sustain, op2_attack and op2_sustain are defined
@ elsewhere; presumably they return to op1_env_done / op2_env_done and
@ leave the registers above intact - TODO confirm.
@=======================================================================
.for_channel:
        @-------
        @ Load the SLOT-specific data values
        @-------
        ldrb    r1, [r0, #(ch0_slot1_bits-SLOT1)]               @ r1 = SLOT1 bits (Feedback(3 bits), Con, AM, Vib, EG type, KSR)
        ldr     r2, [r0, #(ch0_slot1_wavetable-SLOT1)]          @ r2 = SLOT1 wavetable value
        ldr     r3, [r0, #(ch0_slot1_op1_out-SLOT1)]            @ r3 = SLOT1 op1_out value
        ldr     r4, [r0, #(ch0_slot1_env_sustain-SLOT1)]        @ r4 = SLOT1 sustain level (MAX_ATT_INDEX if release phase)
        ldr     r5, [r0, #(ch0_slot1_envelope-SLOT1)]           @ r5 = SLOT1 envelope counter value
        ldr     r6, [r0, #(ch0_slot1_Incr-SLOT1)]               @ r6 = SLOT1 Incr value
        ldr     r7, [r0, #(ch0_slot1_Cnt-SLOT1)]                @ r7 = SLOT1 Cnt value
        ldr     r8, [r0, #(ch0_slot1_volume-SLOT1)]             @ r8 = SLOT1 volume value << 16
        ldr     r9, [r0, #(ch0_slot1_TLL-SLOT1)]                @ r9 = SLOT1 TLL value
        ldr     r10, =sin_tab
        ldr     r11, =tl_tab
        add     r10, r2                                         @ r10 = sin_tab + SLOT1->wavetable
        @-------
        @ tmp = lfo_am_table[ OPL->lfo_am_cnt >> LFO_SH ];
        @ if (OPL->lfo_am_depth)
        @     LFO_AM = tmp;
        @ else
        @     LFO_AM = tmp>>2;
        @
        @ Here the low byte of r12 already holds the table value together
        @ with the depth flag in bit 0, so the selection becomes a choice
        @ of shift count: 3 when the depth flag is set, 5 when it is clear.
        @-------
        bic     r12, #0xFF00                                    @ Clear the previous slot bits
        orr     r12, r1, lsl #8                                 @ Put all bit values of SLOT1 into r12 second byte
        and     r1, r12, #0xFF                                  @ r1 = LFO_AM and lfo_am_depth
        mov     r2, #3
        tst     r1, #1                                          @ if (OPL->lfo_am_depth)
        moveq   r2, #5                                          @ depth flag clear: two extra bits of shift (tmp>>2)
        tst     r12, #(8<<8)                                    @ AM bit set?
        addne   r9, r1, lsr r2                                  @ Now r9 = SLOT1 volume base (0..511?)
        orr     r12, #((ADLIB_BUFFER_SAMPLES-1)<<20)            @ Sample loop counter into bits 20 and up
.for_SLOT1:                                                     @ for( i=length-1; i >= 0 ; i-- ) {
        @-------
        @ Calculate the FM part for SLOT 1
        @ Output goes always to the output buffer, SLOT2 uses it as phase_modulation or direct output depending on connect
        @
        @ FREQ_SH = 16
        @ FREQ_MASK = 65535
        @ SIN_MASK = 1023
        @ ENV_QUIET = 384 (= 6144>>4)
        @
        @ out = SLOT->op1_out[0] + SLOT->op1_out[1];
        @ SLOT->op1_out[0] = SLOT->op1_out[1];
        @ SLOT->op1_out[1] = 0;
        @ *SLOT->connect1 += SLOT->op1_out[0];
        @ env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
        @ if( env < ENV_QUIET )
        @ {
        @     if (!SLOT->FB)
        @         out = 0;
        @     UINT32 p = (env<<4) + sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB))) >> FREQ_SH ) & SIN_MASK) ];
        @     if (p < TL_TAB_LEN)
        @         SLOT->op1_out[1] = tl_tab[p];
        @ }
        @
        @ The C above is the reference algorithm. There is no separate
        @ instruction for the env < ENV_QUIET test below; only the
        @ p < TL_TAB_LEN range check is performed.
        @-------
        strh    r3, [lr], #2                                    @ Store the low halfword of r3 (newest op1 output) to the buffer and advance; a plain store, not the += of the C reference
        add     r2, r9, r8, lsr #16                             @ r2 = env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
        tst     r12, #(7<<(8+5))                                @ Feedback = 0?
        moveq   r1, r7, lsr #16                                 @ Yes, use only SLOT->Cnt
        beq     .op1_no_feedback
        @-------
        @ Feedback active, calculate r1 = ((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB)))
        @-------
        mov     r1, r12, lsr #(8+5)                             @ r1 = feedback value, 1..7, 7 = smallest feedback, 1 = largest
        and     r1, #7
        add     r1, r7, r3, asr r1                              @ r1 = SLOT1->Cnt + (packed r3 asr feedback); stands in for (out<<SLOT->FB)
        lsr     r1, #16                                         @ r1 >>= FREQ_SH
.op1_no_feedback:
        bic     r1, #0xFC00                                     @ r1 &= SIN_MASK
        lsl     r3, #16                                         @ Previous output moves to the high halfword, low halfword = 0 (op1_out[0] = op1_out[1]; op1_out[1] = 0)
        ldr     r1, [r10, r1, lsl #2]                           @ r1 = sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (out<<SLOT->FB))) >> FREQ_SH ) & SIN_MASK) ];
        add     r1, r2, lsl #5                                  @ r1 = env<<4 + sin_tab[..], extra << 1 for halfword accessing
        cmp     r1, #(2*TL_TAB_LEN)                             @ if (p < TL_TAB_LEN)
        ldrloh  r1, [r11, r1]
        orrlo   r3, r1                                          @ SLOT->op1_out[1] = tl_tab[p];
        @-------
        @ Calculate envelope for SLOT 1
        @-------
        tst     r5, #1                                          @ Are we in ATTACK phase?
        bne     op1_attack                                      @ Yes, go handle ATTACK phase volume envelope
        add     r8, r5, lsr #1                                  @ Step the volume value by the envelope counter (r5 without its attack flag bit)
        cmp     r8, r4                                          @ Did we pass the SUSTAIN level? (unsigned compare)
        bhi     op1_sustain                                     @ Yep, go adjust the volume
op1_env_done:
        @-------
        @ Calculate phase generator values for SLOT 1
        @
        @ /* Phase Generator */
        @ if(op->vib)
        @ {
        @     unsigned int block_fnum = CH->block_fnum;
        @     unsigned int fnum_lfo = (block_fnum&0x0380) >> 7;
        @     signed int lfo_fn_table_index_offset = lfo_pm_table[LFO_PM + 16*fnum_lfo ];
        @     if (lfo_fn_table_index_offset) /* LFO phase modulation active */
        @     {
        @         block_fnum += lfo_fn_table_index_offset;
        @         UINT8 block = (block_fnum&0x1c00) >> 10;
        @         op->Cnt += (OPL->fn_tab[block_fnum&0x03ff] >> (7-block)) * op->mul;
        @     }
        @     else /* LFO phase modulation = zero */
        @         op->Cnt += op->Incr;
        @ }
        @ else /* LFO phase modulation disabled for this operator */
        @     op->Cnt += op->Incr;
        @
        @ Only the plain op->Cnt += op->Incr case is implemented below.
        @-------
        add     r7, r6                                          @ SLOT1->Cnt += SLOT1->Incr
        @-------
        @ Loop to next sample
        @-------
        subs    r12, #(1<<20)
        bpl     .for_SLOT1                                      @ }
        add     r12, #(1<<20)                                   @ Undo the final decrement that made r12 negative
        @-------
        @ Save the final values for SLOT1
        @-------
        str     r3, [r0, #(ch0_slot1_op1_out-SLOT1)]            @ r3 = SLOT1 op1_out value
        str     r4, [r0, #(ch0_slot1_env_sustain-SLOT1)]        @ r4 = SLOT1 sustain level (MAX_ATT_INDEX if release phase)
        str     r5, [r0, #(ch0_slot1_envelope-SLOT1)]           @ r5 = SLOT1 envelope counter & mode value
        str     r7, [r0, #(ch0_slot1_Cnt-SLOT1)]                @ r7 = SLOT1 Cnt value
        str     r8, [r0, #(ch0_slot1_volume-SLOT1)]             @ r8 = SLOT1 volume value << 16
        @-------
        @ Calculate all the 256 samples for SLOT 2
        @
        @ r0 now points at the SLOT2 data, so the ch0_slot1_* offsets
        @ below read the SLOT2 fields. r11 is not reloaded; it is expected
        @ to still hold tl_tab from the SLOT1 setup.
        @-------
        add     r0, #SLOT_SIZE
        sub     lr, #ADLIB_BUFFER_SIZE                          @ Rewind buffer pointer back to start
        ldrb    r1, [r0, #(ch0_slot1_bits-SLOT1)]               @ r1 = SLOT2 bits (Feedback, Con, AM, Vib, EG type, KSR)
        ldr     r2, [r0, #(ch0_slot1_wavetable-SLOT1)]          @ r2 = SLOT2 wavetable value
        ldr     r4, [r0, #(ch0_slot1_env_sustain-SLOT1)]        @ r4 = SLOT2 sustain level (MAX_ATT_INDEX if release phase)
        ldr     r5, [r0, #(ch0_slot1_envelope-SLOT1)]           @ r5 = SLOT2 envelope value
        ldr     r6, [r0, #(ch0_slot1_Incr-SLOT1)]               @ r6 = SLOT2 Incr value
        ldr     r7, [r0, #(ch0_slot1_Cnt-SLOT1)]                @ r7 = SLOT2 Cnt value
        ldr     r8, [r0, #(ch0_slot1_volume-SLOT1)]             @ r8 = SLOT2 volume value << 16
        ldr     r9, [r0, #(ch0_slot1_TLL-SLOT1)]                @ r9 = SLOT2 TLL value
        ldr     r10, =sin_tab
        and     r3, r12, #0xFF                                  @ r3 = LFO_AM and lfo_am_depth
        bic     r12, #0xFF00                                    @ Clear the SLOT1 bits
        add     r10, r2                                         @ r10 = sin_tab + SLOT2->wavetable
        @-------
        @ tmp = lfo_am_table[ OPL->lfo_am_cnt >> LFO_SH ];
        @ if (OPL->lfo_am_depth)
        @     LFO_AM = tmp;
        @ else
        @     LFO_AM = tmp>>2;
        @-------
        mov     r2, #3
        tst     r3, #1                                          @ if (OPL->lfo_am_depth)
        moveq   r2, #5                                          @ depth flag clear: two extra bits of shift (tmp>>2)
        tst     r1, #8                                          @ AM bit set?
        addne   r9, r3, lsr r2                                  @ Now r9 = SLOT2 volume base (0..511?)
        orr     r12, r1, lsl #8                                 @ Put all bit values of SLOT2 into r12 second byte
        orr     r12, #((ADLIB_BUFFER_SAMPLES-1)<<20)            @ Sample loop counter into bits 20 and up
.for_SLOT2:                                                     @ for( i=length-1; i >= 0 ; i-- ) {
        @-------
        @ Calculate the FM part for SLOT 2
        @
        @ env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
        @ if( env < ENV_QUIET )
        @ {
        @     UINT32 p = (env<<4) + sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (phase_modulation<<16))) >> FREQ_SH ) & SIN_MASK) ];
        @     if (p >= TL_TAB_LEN)
        @         output[0] += 0;
        @     else
        @         output[0] += tl_tab[p];
        @ }
        @
        @ As in SLOT 1, only the p < TL_TAB_LEN range check is performed.
        @-------
        ldrh    r3, [lr]                                        @ r3 = either phase_modulation or output[0]
        add     r2, r9, r8, lsr #16                             @ r2 = env = ((SLOT)->TLL + ((UINT32)(SLOT)->volume) + (LFO_AM & (SLOT)->AMmask));
        tst     r12, #(1<<(8+4))                                @ If Con=1, op1 produces sound directly, else use it as phase modulation
        addeq   r1, r7, r3, lsl #16                             @ Con=0: r1 = ((SLOT->Cnt) + (phase_modulation<<16))
        moveq   r3, #0                                          @ Con=0: op1 is not heard directly, start the output from 0
        movne   r1, r7                                          @ Con=1: phase is SLOT->Cnt alone, r3 keeps the op1 output
        lsr     r1, #16                                         @ r1 >>= FREQ_SH
        bic     r1, #0xFC00                                     @ r1 &= SIN_MASK
        ldr     r1, [r10, r1, lsl #2]                           @ r1 = sin_tab[SLOT->wavetable + ((((signed int)((SLOT->Cnt & ~FREQ_MASK) + (phase_modulation<<16))) >> FREQ_SH ) & SIN_MASK) ];
        add     r1, r2, lsl #5                                  @ r1 = env<<4 + sin_tab[..], extra << 1 for halfword accessing
        cmp     r1, #(2*TL_TAB_LEN)                             @ if (p < TL_TAB_LEN)
        ldrloh  r1, [r11, r1]
        addlo   r3, r1                                          @ output[0] += tl_tab[p];
        @-------
        @ Store the sample to output buffer
        @
        @ lt = output[0];
        @ lt >>= FINAL_SH;
        @ /* limit check */
        @ lt = limit( lt , MAXOUT, MINOUT );
        @ /* store to sound buffer */
        @ buf[i] = lt;
        @
        @ No FINAL_SH shift or limit check is done here; the low halfword
        @ of r3 is stored as it is.
        @-------
        strh    r3, [lr], #2                                    @ buf[i] = output[0];
        @-------
        @ Calculate envelope for SLOT 2
        @-------
        tst     r5, #1                                          @ Are we in ATTACK phase?
        bne     op2_attack                                      @ Yes, go handle ATTACK phase volume envelope
        add     r8, r5, lsr #1                                  @ Step the volume value by the envelope counter (r5 without its attack flag bit)
        cmp     r8, r4                                          @ Did we pass the SUSTAIN level? (unsigned compare)
        bhi     op2_sustain                                     @ Yep, go adjust the volume
op2_env_done:
        @-------
        @ Calculate phase generator values for SLOT 2
        @
        @ /* Phase Generator */
        @ if(op->vib)
        @ {
        @     unsigned int block_fnum = CH->block_fnum;
        @     unsigned int fnum_lfo = (block_fnum&0x0380) >> 7;
        @     signed int lfo_fn_table_index_offset = lfo_pm_table[LFO_PM + 16*fnum_lfo ];
        @     if (lfo_fn_table_index_offset) /* LFO phase modulation active */
        @     {
        @         block_fnum += lfo_fn_table_index_offset;
        @         UINT8 block = (block_fnum&0x1c00) >> 10;
        @         op->Cnt += (OPL->fn_tab[block_fnum&0x03ff] >> (7-block)) * op->mul;
        @     }
        @     else /* LFO phase modulation = zero */
        @         op->Cnt += op->Incr;
        @ }
        @ else /* LFO phase modulation disabled for this operator */
        @     op->Cnt += op->Incr;
        @
        @ Only the plain op->Cnt += op->Incr case is implemented below.
        @-------
        add     r7, r6                                          @ SLOT->Cnt += SLOT->Incr
        @-------
        @ Loop to next sample
        @-------
        subs    r12, #(1<<20)
        bpl     .for_SLOT2                                      @ }
        add     r12, #(1<<20)                                   @ Undo the final decrement that made r12 negative
        @-------
        @ Save the final values for SLOT2
        @-------
        str     r4, [r0, #(ch0_slot1_env_sustain-SLOT1)]        @ r4 = SLOT2 sustain level (MAX_ATT_INDEX if release phase)
        str     r5, [r0, #(ch0_slot1_envelope-SLOT1)]           @ r5 = SLOT2 envelope value
        str     r7, [r0, #(ch0_slot1_Cnt-SLOT1)]                @ r7 = SLOT2 Cnt value
        str     r8, [r0, #(ch0_slot1_volume-SLOT1)]             @ r8 = SLOT2 volume value << 16
        @-------
        @ Go handle the next channel unless this was already the last channel.
        @
        @ NOTE(review): the counter is incremented before the compare, so
        @ the loop repeats while it is below 8 (or below 6 in rhythm mode).
        @ If the counter starts at 0 - its initialisation is not visible
        @ here - that is 8 melodic channels, not 9. Confirm the limit.
        @-------
        add     r0, #SLOT_SIZE                                  @ r0 = SLOT1 data of the next channel
        add     lr, #(ADLIB_BUFFER_SIZE)                        @ Skip a further ADLIB_BUFFER_SIZE bytes; presumably the next channel's buffer - TODO confirm the buffer layout
        add     r12, #0x00010000                                @ channel++
        and     r4, r12, #0x000F0000                            @ r4 = channel counter, still in bits 16..19
        tst     r12, #4                                         @ Do we have rhythm mode on?
        moveq   r5, #0x00080000                                 @ Nope, loop while the channel counter is below 8
        movne   r5, #0x00060000                                 @ Yep, handle 6 melodic channels
        cmp     r4, r5
        blt     .for_channel
Pate
_________________
- Now working on DSx86 http://dsx86.patrickaalto.com
- Get LineWarsDS from http://linewars.patrickaalto.com