gbadev.org forum archive

Hey guys. I recently finished a stereo mixer which I should be about to release but it takes a bit long to mix down each channel. I was wondering if there's any room for improvement?

Code:

@ S3MTmp: intermediate accumulation buffer used by S3MMixdown.
@ Size = 304 samples * 2 channels (left/right) * 16 bits / 8 = 1216 bytes.
@ Placed in the .iwram section; exported so other modules can reference it.
.section .iwram, "ax", %progbits
.align
.global S3MTmp

S3MTmp:
.space 304*2*16/8

@ record the object size in the symbol table
.size S3MTmp, .-S3MTmp
.align

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

.section .iwram, "ax", %progbits
.align
.arm

.global S3MMixdown
.extern S3MBuffer
.extern S3MVrs

@-----------------------------------------------------------------------
@ S3MMixdown - mix all active channels into S3MTmp, then clip the result
@              down to 8 bit and append it to the left/right output
@              buffers.
@
@ In:    r0 = number of samples to mix
@             NOTE(review): the loops step by 4 (clear) and 2 (mix,
@             downsample) samples, so the count is presumably expected
@             to be a multiple of 4 - confirm against the caller.
@ Out:   nothing
@ Clobb: r0-r3, r12, flags (r4-r11 and r14 are saved and restored)
@
@ S3MTmp layout: for every pair of samples one word of packed 16 bit
@ left accumulators (sample n in the low half, n+1 in the high half)
@ followed by one word of packed right accumulators.
@-----------------------------------------------------------------------

@ Offsets into the sound area (S3MVrs)
.equ S3M_VRS_MB,        0x04        @ current mix buffer pointer
.equ S3M_VRS_MASTERVOL, 0x15        @ master volume (byte)
.equ S3M_VRS_CHANNELS,  0x28        @ first channel record

@ Channel record layout
.equ S3M_CHN_CP,        0x08        @ position (20.12 fixed point)
.equ S3M_CHN_LB,        0x14        @ loop begin (20.12 fixed point)
.equ S3M_CHN_SIZE,      0x18        @ size of one channel record
.equ S3M_CHN_COUNT,     0x20        @ number of channel records

@ Channel status bits
.equ S3M_CS_ACTIVE,     0x01
.equ S3M_CS_LOOP,       0x02
.equ S3M_CS_VOL_MASK,   0x7F        @ 7 bit volume fields at bit 2 / bit 9

.equ S3M_POS_SHIFT,     0x0C        @ fractional bits of a position
.equ S3M_RIGHT_OFFSET,  0x260       @ right output buffer = left + 0x260

@ Load one signed 16 bit accumulator from [r1]+, scale it down by 7 bits,
@ clamp it to -128..127 and store it as a byte to [\out]+.  Uses r5.
.macro S3M_CLAMP_STORE out
    ldrsh   r5, [r1], #0x02
    movs    r5, r5, asr #0x07           @ N set if the value is negative
    cmnmi   r5, #0x80                   @ negative: compare against -128
    mvnlt   r5, #0x7F                   @   below -128 -> -128
    cmppl   r5, #0x7F                   @ otherwise: compare against 127
    movgt   r5, #0x7F                   @   above 127 -> 127
    strb    r5, [\out], #0x01
.endm

S3MMixdown:
.LStart:
    stmfd   sp!, {r4-r11, r14}          @ save used registers

.LPreClearLoop:
    ldr     r1, =S3MTmp                 @ r1 = S3MTmp
    mov     r2, r0                      @ r2 = samples to clear
    mov     r3, #0x00
    mov     r4, #0x00
    mov     r5, #0x00
    mov     r6, #0x00

.LClearLoop:
    stmia   r1!, {r3-r6}                @ zero 16 bytes = 4 stereo samples
    subs    r2, r2, #0x04
    bhi     .LClearLoop                 @ loop while samples remain

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

.LPreChannelLoop:
    mov     r1, #S3M_CHN_COUNT          @ r1 = channels left (32)
    ldr     r2, =S3MVrs
    add     r2, r2, #S3M_VRS_CHANNELS   @ r2 = pointer to current channel

.LChannelLoop:
    ldr     r3, [r2]                    @ r3 = channel status
    tst     r3, #S3M_CS_ACTIVE
    @ An inactive channel is skipped without any write-back: r6 still
    @ holds the previous channel's position at this point, so storing it
    @ would corrupt this channel's position field.
    addeq   r2, r2, #S3M_CHN_SIZE
    beq     .LChannelNext

.LPreSampleLoop:
    mov     r4, r0                      @ r4 = samples left to mix
    ldmib   r2, {r5-r8}                 @ r5 = source
                                        @ r6 = position
                                        @ r7 = increment
                                        @ r8 = length
    ldr     r9, =S3MTmp                 @ r9 = accumulator pointer
    ldr     r11, =S3MVrs
    ldrb    r12, [r11, #S3M_VRS_MASTERVOL] @ r12 = master volume

    mov     r10, r3, asr #0x02
    and     r10, r10, #S3M_CS_VOL_MASK
    mul     r10, r12, r10               @ channel vol (left) * master vol
    mov     r10, r10, asr #0x04         @ r10 = final left volume

    mov     r11, r3, asr #0x09
    and     r11, r11, #S3M_CS_VOL_MASK
    mul     r11, r12, r11               @ channel vol (right) * master vol
    mov     r11, r11, asr #0x04         @ r11 = final right volume

.LSampleLoop:
    @ fetch two consecutive output samples and pack them into one word
    add     r12, r5, r6, lsr #S3M_POS_SHIFT
    ldrsb   r12, [r12]                  @ r12 = first sample
    add     r6, r6, r7                  @ pos += inc
    add     r14, r5, r6, lsr #S3M_POS_SHIFT
    ldrsb   r14, [r14]                  @ r14 = second sample
    add     r6, r6, r7                  @ pos += inc
    add     r12, r12, r14, lsl #0x10    @ r12 = both samples, 16 bits each

    @ left channel: one mla scales and accumulates both samples
    ldr     r14, [r9]
    mla     r14, r10, r12, r14
    str     r14, [r9], #0x04

    @ right channel
    ldr     r14, [r9]
    mla     r14, r11, r12, r14
    str     r14, [r9], #0x04

    cmp     r6, r8
    bhs     .LSampleNoData              @ unsigned: positions are u32 20.12

.LSampleLoopEnd:
    subs    r4, r4, #0x02               @ two samples done
    bhi     .LSampleLoop

.LChannelLoopEnd:
    str     r3, [r2], #S3M_CHN_CP       @ write back status, step to position
    str     r6, [r2], #(S3M_CHN_SIZE - S3M_CHN_CP) @ write back position, step to next channel

.LChannelNext:
    subs    r1, r1, #0x01
    bne     .LChannelLoop               @ more channels to go
    b       .LPreDownsampleLoop         @ skip the end of sample handler

.LSampleNoData:
    tst     r3, #S3M_CS_LOOP
    ldrne   r6, [r2, #S3M_CHN_LB]       @ looping: restart at loop begin
    bne     .LSampleLoopEnd

    bic     r3, r3, #S3M_CS_ACTIVE      @ not looping: channel is finished
    b       .LChannelLoopEnd

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

.LPreDownsampleLoop:
    ldr     r1, =S3MTmp                 @ r1 = accumulator read pointer
    ldr     r2, =S3MVrs
    ldr     r3, [r2, #S3M_VRS_MB]!      @ r3 = left output; r2 -> mix buffer field
    add     r4, r3, #S3M_RIGHT_OFFSET   @ r4 = right output
    add     r5, r3, r0                  @ r5 = mix buffer + samples mixed
    str     r5, [r2]                    @ advance the stored mix buffer pointer

.LDownsampleLoop:
    S3M_CLAMP_STORE r3                  @ left,  sample n
    S3M_CLAMP_STORE r3                  @ left,  sample n+1

    S3M_CLAMP_STORE r4                  @ right, sample n
    S3M_CLAMP_STORE r4                  @ right, sample n+1

    subs    r0, r0, #0x02
    @ bhi, not bcs: carry is also set when the count reaches exactly
    @ zero, so bcs would run one extra iteration past the mixed data and
    @ past the requested output length.
    bhi     .LDownsampleLoop

.LEnd:
    ldmfd   sp!, {r4-r11, r14}          @ restore used registers
    bx      lr                          @ return

These are the structs

Code:

// One mixer channel. With 32-bit fields throughout, the record is 0x18 bytes,
// which is the stride S3MMixdown uses to step from channel to channel.
typedef struct tChn {
u32 cs; //channel status (+0). bit0: on. bit1: loop. bit2-8: vol left. bit9-F: vol right
s8* wp; //wave pointer (+4), signed 8-bit sample data
u32 cp; //channel position (+8) (20.12 fixed point)
u32 ci; //channel increment (+C) (20.12 fixed point), added to cp per output sample
u32 cl; //channel length (+10) (20.12 fixed point)
u32 lb; //loop begin (+14) (20.12 fixed point), reloaded into cp when a looping sample ends
} tChn;

// Global sound state. Offsets in the comments are hexadecimal and assume
// 32-bit pointers with natural alignment (as on the GBA's ARM7TDMI).
typedef struct tSndArea {
//mixer stuff
s8 *bb; //base buffer (+0)
s8 *mb; //mix buffer (current) (+4), advanced by S3MMixdown after each call

u32 fr; //mix frequency (+8)
u32 st; //samples til tick (20.12FP) (+C)
u32 sp; //samples per tick (20.12FP) (+10)

u8 ab; //active buffer (+14)
u8 mv; //master volume (default 15) (+15)
u16 sc; //sound control (+16)

//player stuff
tS3M *pm; //pointer to module (+18)
u8 *nb; //next byte (+1C)

u8 mp; //module speed (+20)
u8 mt; //module tempo (+21)
u8 mc; //mixer count/tick/tap (+22)

u8 cr; //current row (+23)
u8 co; //current order (+24)
u8 oc; //order count (+25)

// tChn needs 4-byte alignment, so 2 bytes of padding follow oc and the
// array starts at +28 (the offset S3MMixdown adds), not +26.
tChn pc[32]; //player channels (+28)

u8 pd; //pattern delays left (followed by 3 bytes of padding before ef)

u32 ef; //effect flags
u8 ep[32]; //effect parameters
} tSndArea;

The mono code for the mixer is really fast but after I added those 3 instructions for the right channel, it's slowed down quite a lot. Thanks a lot guys.

EDIT: Fixed up the misaligned text

Loads, multiplies, and stores are some of the slowest instructions, so I'm not surprised that those 3 would add a lot.

I'm a bit rusty on mixer optimization and I haven't thoroughly examined the code, but here are a few things I noticed:

One "free" speed boost you can have here is to swap the middle 2 registers on the multiplies. The cycles taken depends on the number of "significant" bytes in the SECOND multiplying register. Since you have 2 samples packed together, that register should go first, and the volume second, because the volume should be all zeroes in the upper 3 bytes.

That said, I'd recommend packing the left and right volumes together, rather than 2 samples. Saves at least the one add per 2 samples, and probably will save some registers too. But in that case, you would want to multiply with the volume register first, followed by the one byte sample.

Since you're not using a whole lot of registers even as it is, how about unrolling that loop 4 or 8 times and ldm/stm'ing the temp buffer values?

Also, make sure the temp buffer is in IWRAM. Even if it is pretty big, it's only needed during the mixer function, so it can go on the stack. Of course, if that's the only thing in your game that uses a lot of stack space, then it's just as bad as giving it a permanent chunk. But if a few other speed-critical sections can get some use out of a big stack, then there's no guilt in using it to speed up the mixer.
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

Code:

@ do left channel
add r12, r5, r6, lsr #0x0C @ r12 = position in ROM
ldrsb r12, [r12] @ r12 = sample
add r6, r6, r7 @ pos += inc
add r14, r5, r6, lsr #0x0C @ r14 = position in ROM
ldrsb r14, [r14] @ r14 = sample
add r6, r6, r7 @ pos += inc
add r12, r12, r14, lsl #0x10 @ r12 = 2 samples

You can add a shifted register to the LDRSB address, instead of using an add before the load.

Code:

@ do left channel
ldrsb r12, [r5, r6, lsr#0x0C] @ r12 = sample
add r6, r6, r7 @ pos += inc
ldrsb r14, [r5, r6, lsr#0x0C] @ r14 = sample
add r6, r6, r7 @ pos += inc
add r12, r12, r14, lsl #0x10 @ r12 = 2 samples

eKid wrote:

Code:

@ do left channel
add r12, r5, r6, lsr #0x0C @ r12 = position in ROM
ldrsb r12, [r12] @ r12 = sample
add r6, r6, r7 @ pos += inc
add r14, r5, r6, lsr #0x0C @ r14 = position in ROM
ldrsb r14, [r14] @ r14 = sample
add r6, r6, r7 @ pos += inc
add r12, r12, r14, lsl #0x10 @ r12 = 2 samples

You can add a shifted register to the LDRSB address, instead of using an add before the load.

Code:

@ do left channel
ldrsb r12, [r5, r6, lsr#0x0C] @ r12 = sample
add r6, r6, r7 @ pos += inc
ldrsb r14, [r5, r6, lsr#0x0C] @ r14 = sample
add r6, r6, r7 @ pos += inc
add r12, r12, r14, lsl #0x10 @ r12 = 2 samples

Nope, already tried that many times before. 'ldrb r0, [r1, r2, lsr #0x02]' is OK but 'ldrsb' doesn't work for it.

oh.. oops :P
My mixer takes unsigned samples... so I can use that trick. (aren't the samples in S3M unsigned?)

eKid wrote:

oh.. oops :P
My mixer takes unsigned samples... so I can use that trick. (aren't the samples in S3M unsigned?)

Yeah, they are but I XORed them with 128 'cause I couldn't get unsigned samples working right >.<" And besides, most games use signed samples so I thought I'd join in :D Lol.

Quote:

That said, I'd recommend packing the left and right volumes together, rather than 2 samples. Saves at least the one add per 2 samples, and probably will save some registers too. But in that case, you would want to multiply with the volume register first, followed by the one byte sample.

So, basically, load a sample into (i.e.) r0, multiply that sample by left vol into r1, multiply the sample by right vol into r2, add them together with a 'lsl #0x10' and store? If so, how? It confused me the second I tried it 'cause I'm using all the registers.

Quote:

Since you're not using a whole lot of registers even as it is, how about unrolling that loop 4 or 8 times and ldm/stm'ing the temp buffer values?

Umm... I suppose you mean that I don't use too many registers within the sample loop which means that I would need to stack one or two, right? 'Cause I'm using r0-r14 (except r13) atm...

Quote:

Also, make sure the temp buffer is in IWRAM. Even if it is pretty big, it's only needed during the mixer function, so it can go on the stack. Of course, if that's the only thing in your game that uses a lot of stack space, then it's just as bad as giving it a permanent chunk. But if a few other speed-critical sections can get some use out of a big stack, then there's no guilt in using it to speed up the mixer.

... Huh??????

Ruben wrote:

So, basically, load a sample into (i.e.) r0, multiply that sample by left vol into r1, multiply the sample by right vol into r2, add them together with a 'lsl #0x10' and store?

Not quite. Before going into the sample loop, load the left and right volumes, and add them together with lsl #0x10. Then inside the sample loop, a single mla multiplies one sample with both volumes, and adds to the result. Very similar to what you're doing, just multiplying 1 sample by 2 volumes, rather than 2 samples by 1 volume. You will end up with interleaved left/right samples, but that can be sorted out in the downsampling step.

Ruben wrote:

Umm... I suppose you mean that I don't use too many registers within the sample loop which means that I would need to stack one or two, right?

Yeah. The sample loop gets a whole lot of repetitions, so if stacking a few registers can save a few cycles per sample, do it.

Ruben wrote:

DekuTree64 wrote:

blah blah blah ... temp buffer ... IWRAM ... blah

... Huh??????

I'm talking about your global "S3MTmp", which seems to be the buffer that you're storing the intermediate mixed samples in before downsampling at the end. Since you have to load and store to it for every sample for every channel, putting it in fast memory saves a whole lot of cycles.

...but I just noticed that it's declared right there at the top, in IWRAM, so you can just leave it as is.
_________________
___________
The best optimization is to do nothing at all.
Therefore a fully optimized program doesn't exist.
-Deku

Quote:

Not quite. Before going into the sample loop, load the left and right volumes, and add them together with lsl #0x10. Then inside the sample loop, a single mla multiplies one sample with both...

Tried that and it actually slowed down the code... :S Maybe because it takes double the iterations?

Quote:

Yeah. The sample loop gets a whole lot of repetitions, so if stacking a few registers can save a few cycles per sample, do it.

Theoretically, I'm supposed to be saving around 1 or 2 cycles per sample but the CPU usage (according to no$gba) is still the same as when I used str twice. Right now I'm using double samples rather than left and right at the same time and I'm stacking r0 and r1 and using them as temps (r0: sample, r1: left sample, r12: right sample) then stmia'ing them to the temp buffer but it has the same speed. Should I still stack or should I revert back to what it was?

Oh, there's so many things! Unsigned mixing is definately faster, but it's a bit more complex since you need to track the DC offsets per frame rendered. Also, in my modplayer, I moved the looping-logic outside of the raw mixer. You can have a look at how that's done here. So, once you've done both of those optimizations, unrolling is your next friend, since the loads and stores from your temp-buffer become much faster, and you don't get that expensive branch for every sample.

Just a few small things:

Code:

mov r10, r3, asr #0x02
and r10, r10, #0x7F
mul r10, r12, r10 @ r10 = channel vol (left) * master vol
mov r10, r10, asr #0x04 @ r10 = final volume

mov r11, r3, asr #0x09
and r11, r11, #0x7F
mul r11, r12, r11 @ r11 = channel vol (right) * master vol
mov r11, r11, asr #0x04 @ r11 = final volume

Preload the 0x7F and use a shifted AND:

Code:

mov r14, #0x7F @ r14 = 7 bit volume mask, loaded once

and r10, r14, r3, asr #2 @ r10 = left volume field (status bits 2-8)
mul r10, r12, r10 @ r10 = channel vol (left) * master vol
mov r10, r10, asr #0x04 @ r10 = final volume

and r11, r14, r3, asr #9 @ r11 = right volume field (status bits 9-15)
mul r11, r12, r11 @ r11 = channel vol (right) * master vol
mov r11, r11, asr #0x04 @ r11 = final volume

Code:

ldr r14, [r9] @ r14 = mixed data
mla r14, r10, r12, r14 @ r14 = final data
str r14, [r9], #0x04 @ store mixed data and increment

@ do right channel
ldr r14, [r9] @ r14 = mixed data
mla r14, r11, r12, r14 @ r11 = final data
str r14, [r9], #0x04 @ store mixed data and increment

You can combine the left/right loads and stores. Assuming r10 is free:

Code:

ldmia r9, {r10, r14} @ r10 = mixed left data, r14 = mixed right data
mla r10, r10, r12, r10 @ r10 = final left data. NOTE(review): the first multiplicand must be the left volume, which therefore has to live in a register other than r10 once r10 is the accumulator
mla r14, r11, r12, r14 @ r14 = final right data
stmia r9!, {r10, r14} @ store both results and increment

About signed vs unsigned: the problem is probably the center-point. Signed data centers around 0, but in unsigned samples the zero-line is at 128. When you then add samples, the center-line shifts as well. For example, an empty unsigned sample will contain 128s. Adding two of them gives 256, which definitely ([/spelling police]) does not count as empty.

The true range is always [-M/2, +M/2), i.e. a signed sample (s). For unsigned samples (u), the bar is raised by M/2, so you have s = u-M/2. This should still be true after the summation.

Code:

Signed : S = ∑(0 to n) s[i]
vs
Unsigned: U - M/2 = ∑(0 to n) (u[i]-M/2)
U = ( ∑ u[i] ) - (n-1)*M/2

In other words, you have to subtract (n-1)*M/2 from the final sum over all unsigned samples.

Of course, since I hardly know anything about sound programming, I could be entirely wrong here :P

EDIT : forgot about volume.
Things are slightly different when the samples have different volumes. This scales the center-point as well:

Code:

U - M/2 = ∑(0 to n) V[i]*(u[i]-M/2)
U = ∑ V[i]*u[i] - (∑V[i])*M/2 + M/2

But yeah, unsigned should make things faster. The adds for the ldrsb can be combined into a single ldrb. It's also useful in the downsample loop, since clamping between 0 and a power of two can be done in two instructions rather than four.

Also:

Ruben wrote:

Code:

u8 cr; //current row (+23)
u8 co; //current order (+24)
u8 oc; //order count (+25)

tChn pc[32]; //player channels (+26)

u8 pd; //pattern delays left
u32 ef; //effect flags

Watch your member packing (and I'm talking about struct members here >_>). tChn requires 32bit alignment, so it's actually at 0x28. There is a 2-byte gap between oc and pc, as well as a 3-byte gap between pd and ef. Consider moving pd into the gap between oc and pc for a slightly smaller struct.

Wow! That was so much better than what I had expected! Thanks a MILLION guys!

What I ended up doing was following... uh... kusma's (?) advice on calling different functions for different things along with Cearn's ldmia/stmia suggestion. I'm not going to move onto unsigned samples 'cause I never understood how to properly mix them together so I may as well stick to signed mode.

Now to figure out why the right speaker clicks... ROFL!

OK... umm... I went all out and thought of a faster method... this one:

Code:

@ Mixer constants
.equ SND_BUFFERLEN, 304 @ samples per buffer
.equ SND_ACCBITS, 12 @ fractional bits of the fixed point sample position

@ SndTmp: temporary accumulation buffer,
@ SND_BUFFERLEN samples * 2 channels * 16 bits / 8 = 1216 bytes.
.section .iwram, "ax", %progbits
.align
SndTmp:
.space SND_BUFFERLEN*2*16/8
.size SndTmp, .-SndTmp

.section .iwram, "ax", %progbits
.global SndMix
.extern sa__
.align
.arm

@ r0: samples to mix
SndMix:

r14 is a pointer to the channels, r12 is the number of channels left (16 now), and r0 is the samples to mix
...

@ Per-channel mixing loop of SndMix (fragment; setup and .L7 are not shown).
@ On entry: r14 = pointer to the current channel, r12 = channels left,
@ r0 = samples to mix.  Each sample is multiplied by the packed
@ left/right volume word in r1 and accumulated into one word of SndTmp.
.L1:
ldrb    r11, [r14]       @ r11 = channel status flags
tst    r11, #0x01       @ make sure channel is active
beq    .L5          @ inactive: go to the next channel

stmfd    sp!, {r0,r12,r14}   @ saved at [sp] = r0, [sp, #4] = r12, [sp, #8] = r14

ldrb    r1, [r14, #0x02]    @ left volume
ldrb    r2, [r14, #0x03]    @ right volume
add    r1, r1, r2, lsl #0x10    @ r1 = left volume | right volume << 16
ldmib    r14, {r2-r4}       @ r2 = sample (voice) pointer
                  @ r3 = position
                  @ r4 = increment
add    r2, r2, #0x10       @ r2 = voice pointer + 0x10 (presumably the start of the wave data - confirm against the voice struct)
ldr    r5, [r2, #-0x04]    @ r5 = length

ldr    r6, =SndTmp       @ r6 = pointer to temp buffer
add    r7, r2, r3, lsr #SND_ACCBITS   @ r7 = address of the current sample (integer part of position)
add    r14, r4, lsr #SND_ACCBITS   @ NOTE(review): two-operand form, this ADDS to r14 (the channel pointer); the author's follow-up says it should be mov

mla    r8, r4, r0, r3       @ r8 = position after mixing r0 samples
cmp    r5, r8, lsr #(SND_ACCBITS-2)   @ NOTE(review): shifts by SND_ACCBITS-2 while .L2 compares with SND_ACCBITS - confirm which is intended
blt    .L2          @ end of sample may be reached: use the checked loop

@ Unchecked loop, 4 samples per iteration.
@ NOTE(review): r7 is post-indexed by r14, a whole number of bytes; the
@ fractional bits of the increment were shifted out above, so the step
@ is not 20.12 fixed point any more.
.Lfast:
ldmia    r6, {r8-r11}       @ load 4 accumulator words
ldrsb    r12, [r7], r14       @ |
mla    r8, r12, r1, r8       @ |
ldrsb    r12, [r7], r14       @ |
mla    r9, r12, r1, r9       @ do 4 samples
ldrsb    r12, [r7], r14       @ |
mla    r10, r12, r1, r10    @ |
ldrsb    r12, [r7], r14       @ |
mla    r11, r12, r1, r11    @ /
stmia    r6!, {r8-r11}       @ store mixed samples
subs    r0, r0, #0x04
bhi    .Lfast

mov    r7, r0, lsl #0x02    @ NOTE(review): r0 is zero or negative here (the loop above counted it down),
mla    r3, r4, r7, r3       @ so this does not advance the position by the samples just mixed
b    .L4

@ Checked loop, 1 sample per iteration, tests for end of sample.
.L2:
ldr    r8, [r6]       @ get mixed samples
ldrsb    r12, [r7], r14       @ get the sample
mla    r8, r12, r1, r8       @ get final value
str    r8, [r6], #0x04       @ store mixed samples

add    r3, r3, r4       @ position += increment (full fixed point, unlike the r7 step)
cmp    r5, r3, lsr #SND_ACCBITS
ble    .L6          @ length <= position: end of sample

.L3:
subs    r0, r0, #0x01
bhi    .L2

.L4:
ldmfd    sp!, {r0,r12,r14}   @ restore samples to mix, channels left, channel pointer
str    r3, [r14, #0x08]    @ store position

.L5:
add    r14, r14, #0x20       @ go to the next channel
subs    r12, r12, #0x01       @ channels left--
bne    .L1
b    .L7          @ if == 0, skip the EOF handler

@ End of sample reached.
.L6:
ldrh    r8, [r2, #-0x0E]    @ get sample stat
tst    r8, #0x4000       @ check if it's looping
ldrne    r3, [r2, #-0x08]    @ get loop point
movne    r3, r3, lsl #SND_ACCBITS @ put it in FP format (NOTE(review): r7 is not reset to the loop point)
bne    .L3          @ keep looping

ldr    r7, [sp, #-0x04]    @ NOTE(review): reads below sp; the saved channel pointer is at [sp, #0x08]
bic    r8, r8, #0x01       @ otherwise stop channel
strh    r8, [r7]       @ store channel control (NOTE(review): r8 was loaded from the voice, not from the channel's own flags)
b    .L4          @ stop looping
...

I thought that by incrementing by a set value using "ldrsb r0, [r1], r2", it would speed it up but it seems to be reading... ... ... garbage to say the least. Any ideas why?

I also changed the channel structure around like this:

Code:

// One sequencer/mixer channel. Offsets in the comments are hexadecimal;
// the last field ends at +1F, giving the 0x20-byte stride the mixer loop
// uses to step to the next channel.
typedef struct channel {
u8    sf; //status flags (bit0 tested by the mixer as "active") (+00)
s8    pa; //panning             (+01)
u8    lv; //left vol (0->64)          (+02)
u8    rv; //right vol (0->64)          (+03)
voice*    wp; //voice pointer             (+04)
u32    po; //position             (+08)
u32    in; //increment             (+0C)
u8    wl; //waits left             (+10)
u8    pr; //program             (+11)
u8    vo; //volume             (+12)
u8    co; //chn. origin             (+13)
u8*    nb; //next byte             (+14)
u32    ja; //jump address             (+18)
u8    at; //attack envelope          (+1C)
u8    de; //decay envelope          (+1D)
u8    su; //sustain envelope          (+1E)
u8    re; //release envelope          (+1F)
} channel;

Code:

.Lfast:
ldmia r6, {r8-r11} @ \
ldrsb r12, [r7], r14 @ |
mla r8, r12, r1, r8 @ |
ldrsb r12, [r7], r14 @ |
mla r9, r12, r1, r9 @ do 4 samples
ldrsb r12, [r7], r14 @ |
mla r10, r12, r1, r10 @ |
ldrsb r12, [r7], r14 @ |
mla r11, r12, r1, r11 @ /
stmia r6!, {r8-r11} @ store mixed samples
subs r0, r0, #0x04
bhi .Lfast

I can't really understand how this works? Isn't r7 a fixed point number?

eKid wrote:

...
I can't really understand how this works? Isn't r7 a fixed point number?

r7 = sound wave position + (position >> accuracy)
r14 = increment >> accuracy

:P

Oh, I just found a small bug... "add r14, r4, lsr #SND_ACCBITS" should be "mov r14, r4, lsr #SND_ACCBITS". But it's still buggy...

Wouldn't shifting out all the accuracy bits cause a bunch of noise/failure? :)

eKid wrote:

Wouldn't shifting out all the accuracy bits cause a bunch of noise/failure? :)

Noise? Failure? o.O *confused*

What eKid means is that if you shift down before adding, the additions won't be in fixed point anymore and you stand to lose a lot of precision that way.

It's similar to this situation:

Code:

float a= 0.5;
int b= int(a+a+a+a); // b = 0.5+0.5+0.5+0.5 = 2

int a= 0.5; // cast to int removes sub-integer accuracy
int b = a+a+a+a; // b = 0+0+0+0 = 0

Also,

Code:

ldrb r11, [r14]
ldrb r1, [r14, #0x02]
ldrb r2, [r14, #0x03]
add r1, r1, r2, lsl #0x10

works a little faster like this: (10 cycles vs 6)

Code:

ldr r11, [r14] // R L x x
mov r1, r11, lsr #16 // 0 0 R L
orr r1, r1, r1, lsl #8 // 0 R x L
bic r1, r1, #0xFF00 // 0 R 0 L

And if you want to be really 1337, you can simply start with the optimal arrangement in the struct. That said, I doubt this is really the place where you can gain that much speed. You'll probably gain more from switching to unsigned data.

Nice... that is some serious thinking you can do! :P Anyway... is there anyway to contact you outside of the forum for code optimization? (I don't want to disclose the source just yet! :P) I'm pretty sure there's more room for optimization in the actual "sound.c" file so yeah.

gbadev.org forum archive

ASM > Any room for optimization?

#157777 - Ruben - Thu May 29, 2008 2:39 am

#157786 - DekuTree64 - Thu May 29, 2008 5:31 am

#157787 - eKid - Thu May 29, 2008 5:36 am

#157788 - Ruben - Thu May 29, 2008 5:42 am

#157789 - eKid - Thu May 29, 2008 5:45 am

#157790 - Ruben - Thu May 29, 2008 5:49 am

#157791 - Ruben - Thu May 29, 2008 6:01 am

#157792 - DekuTree64 - Thu May 29, 2008 6:26 am

#157793 - Ruben - Thu May 29, 2008 7:00 am

#157794 - kusma - Thu May 29, 2008 10:19 am

#157795 - Cearn - Thu May 29, 2008 11:15 am

#157796 - Ruben - Thu May 29, 2008 12:47 pm

#159170 - Ruben - Thu Jun 26, 2008 1:52 pm

#159171 - eKid - Thu Jun 26, 2008 2:07 pm

#159172 - Ruben - Thu Jun 26, 2008 2:11 pm

#159173 - Ruben - Thu Jun 26, 2008 2:19 pm

#159174 - eKid - Thu Jun 26, 2008 2:23 pm

#159176 - Ruben - Thu Jun 26, 2008 2:32 pm

#159179 - Cearn - Thu Jun 26, 2008 3:51 pm

#159213 - Ruben - Fri Jun 27, 2008 5:00 am