gbadev.org forum archive

This is a read-only mirror of the content originally found on forum.gbadev.org (now offline), salvaged from Wayback machine copies. A new forum can be found here.

ASM > Optimize Paletteinterpolation routine

#52604 - Peter - Thu Sep 01, 2005 8:23 am

Hi,

I wrote the following routine to interpolate between two 16 color palettes and store the interpolation in Vram. Unfortunately it's pretty slow and I would like to know if you have any advice for me how to get it faster.

Code:

.code 32
@.section .iwram, "ax", %progbits
.global  hel_PalInterpolate16
.type    hel_PalInterpolate16, %function
.align

@ r0 = pPaletteMemory, pointer to target palette (BG or OBJ palette in Vram)
@ r1 = pPaletteA, source palette
@ r2 = pPaletteB, source palette
@ r3 = Step, between 0..31
@
@ r4 = inner_loop Counter
@
@ extern void hel_PalInterpolate16(u16 *pPaletteMemory, const void *pPaletteA, const void *pPaletteB, u32 Step);

hel_PalInterpolate16:

  @mov r11, r11
   
  @ save registers on stack
  stmfd sp!, {r0-r10}

   
  @ set the loop counter. run from last entry down to 0.
  @ decrement by 2 each iteration
  mov r4, #30
 
  @ here starts the inner loop
  .inner_loop:
 
   ldrh r5, [r1, r4]      @ get u16 color value from pPaletteA, lets call it colorA
   ldrh r8, [r2, r4]      @ get u16 color value from pPaletteB, lets call it colorB
   
   @ we have both color values at this point.
   @ now we must extract their rgb components
   @ start with colorA
   mov r7, r5, lsr #10      @ get blue component from colorA
   mov r6, r5, lsr #5      @ get green component from colorA
   and r6, r6, #31
   and r5, r5, #31         @ get red component from colorA
 
   @ extract components from colorB now
   mov r10, r8, lsr #10   @ get blue component from colorB
   mov r9, r8, lsr #5      @ get green component from colorB
   and r9, r9, #31
   and r8, r8, #31         @ get red component from colorB
   
   @ now substract components of colorA from colorB
   @ to get the difference between each component.
   @ we store the result in the registers used by components
   @ from colorB, since we don't need them after this anymore
   sub r8, r8, r5         @ red
   sub r9, r9, r6         @ green
   sub r10, r10, r7      @ blue
   
   @ multiply difference with Step and shift 5 bits to the rigth again
   @ Step must be between 0..31. if Step equals 0, colors are entirely
   @ used from pPaletteA, if it is 31 colros entirely used from pPaletteB
   mul r8, r8, r3         @ red
   mul r9, r9, r3         @ green
   mul r10, r10, r3      @ blue
   
   @ now we add them to the components from pPaletteA
   @ and shift them back to have them in the right unitsize (0..31)
   add r5, r5, r8, lsr #5   @ red
   add r6, r6, r9, lsr #5   @ green
   add r7, r7, r10, lsr #5   @ blue
   
   @ now create a new 15bit bgr555 value
   @ r5 contains the red component, we basically logical OR the
   @ green and blue components and shift them to their required positions
   orr r5, r5, r6, lsl #5   @ green
   orr r5, r5, r7, lsl #10   @ blue
   
   @ now store the interpolated
   @ color to pPaletteMemory
   strh r5, [r0, r4]
   
   @ dec counter and loop until r4 equals zero
   subs r4, r4, #2
   bne .inner_loop


  @ load registers from stack
  ldmfd sp!, {r0-r10}
  bx    lr
 .size   hel_PalInterpolate16, .-hel_PalInterpolate16


I measured in 'ticks', where 4399 ticks equal one frame. The routine takes 60 ticks (when it is not located in iwram) and 15 ticks in iwram. I need it to flash 32 palettes in worst case, this is ~2000 ticks which equals almost an half frame. When it's located in iwram, it's of course a lot faster but I'm pretty sure it could be even faster when written with better asm skills than mine :)

Thanks for your helps

#52623 - strager - Thu Sep 01, 2005 1:21 pm

Peter wrote:
Code:
  @ save registers on stack
  stmfd sp!, {r0-r10}

Code:
  @ load registers from stack
  ldmfd sp!, {r0-r10}


Since registers 0 through 3 are expected to be crumbled, you could prevent those registers from being pushed and popped from the stack.

Peter wrote:
Code:
   @ we have both color values at this point.
   @ now we must extract their rgb components
   @ start with colorA
   mov r7, r5, lsr #10      @ get blue component from colorA
   mov r6, r5, lsr #5      @ get green component from colorA
   and r6, r6, #31
   and r5, r5, #31         @ get red component from colorA
 
   @ extract components from colorB now
   mov r10, r8, lsr #10   @ get blue component from colorB
   mov r9, r8, lsr #5      @ get green component from colorB
   and r9, r9, #31
   and r8, r8, #31         @ get red component from colorB


I believe this could be optimized an incy bit:
Code:
mov r10, #0x1F    @ Mask

@ Color A
and r5, r5, #0x1F         @ Red

and r6, r10, r5, lsr #5   @ Green
and r7, r10, r5, lsr #10  @ Blue

@ Color B
and r9, r10, r8, lsr #5   @ Green
and r10, r10, r8, lsr #10 @ Blue
and r8, r8, #0x1F         @ Red

One less instruction. :)

Another note:
Peter wrote:
Code:
   ldrh r5, [r1, r4]      @ get u16 color value from pPaletteA, lets call it colorA
   ldrh r8, [r2, r4]      @ get u16 color value from pPaletteB, lets call it colorB

Code:
   @ now store the interpolated
   @ color to pPaletteMemory
   strh r5, [r0, r4]


Instead of indexing with r4, you could increment the pointer:
Code:
ldrh r5, [r1], #1  @ Color A
ldrh r8, [r2], #1  @ Color B

Code:
strh r5, [r0], #1


I also see a potential bug. When the loop counter is zero upon loop entry, the function will subtract 2, making it 0xFFFFFFFE, which is non-zero. The function will then loop that many times.

More optimizations in store for the future (I must go =]).

Happy optimizing!

#52630 - tum_ - Thu Sep 01, 2005 2:30 pm

strager wrote:

Since registers 0 through 3 are expected to be crumbled, you could prevent those registers from being pushed and popped from the stack.


also you can use r12, no need to save it.


strager wrote:

Instead of indexing with r4, you could increment the pointer:
Code:
ldrh r5, [r1], #1  @ Color A
ldrh r8, [r2], #1  @ Color B

Code:
strh r5, [r0], #1



This gives you the idea but don't use this code, it's wrong ;)
Incrementing by 1 is no good for ldrh (not mentioning that you need
a DEcrement for your loop implementation).
I'd recommend to consider reading/processing/writing more than one
u16 at a time. That is, to use LDR (or even LDM) instead of LDRH.
This is definitely possible but, sorry, I can't write the code
for you at the moment.


strager wrote:

I also see a potential bug. When the loop counter is zero upon loop entry, the function will subtract 2, making it 0xFFFFFFFE, which is non-zero. The function will then loop that many times.


:-) the loop counter is being explicitly set to 30 upon loop entry...

#52640 - gladius - Thu Sep 01, 2005 4:33 pm

Essentially you are doing alpha blending on a 16 bit pixel, which is a subject that has been studied quite a bit.

Check out http://www.gamedev.net/reference/articles/article817.asp for a version that operates on two pixels at once, which saves you some multiplies, although I'm not sure how much exactly it will speed things up.

Wish I had time to code an asm version, maybe tonight :).

#52650 - tum_ - Thu Sep 01, 2005 6:00 pm

Peter wrote:


Code:

   @ now substract components of colorA from colorB
   @ to get the difference between each component.
   @ we store the result in the registers used by components
   @ from colorB, since we don't need them after this anymore
   sub r8, r8, r5         @ red
   sub r9, r9, r6         @ green
   sub r10, r10, r7      @ blue
   
   @ multiply difference with Step and shift 5 bits to the rigth again
   @ Step must be between 0..31. if Step equals 0, colors are entirely
   @ used from pPaletteA, if it is 31 colros entirely used from pPaletteB
   mul r8, r8, r3         @ red
   mul r9, r9, r3         @ green
   mul r10, r10, r3      @ blue
   
   @ now we add them to the components from pPaletteA
   @ and shift them back to have them in the right unitsize (0..31)
   add r5, r5, r8, lsr #5   @ red
   add r6, r6, r9, lsr #5   @ green
   add r7, r7, r10, lsr #5   @ blue



Does the function work correctly? Have you tested it on different inputs?
Don't want to get deep into the task but those subtractions look a bit suspicious to me. What if colorB components are less then corresponding
colorA ones - you get negative results in r8,r9,r10, then you multiply them... Are you sure you'll still get the correct result?

#52716 - Peter - Fri Sep 02, 2005 8:31 am

Hello, thanks for all the feedback! I'll implement your suggestions probably this weekend, eeks :)

@tum_
i only tested 6 different palettes yet. But I thought the same about what happens when B is smaller then A. However, it works so far. When anything weird happens I know where to look first :P

Again, thanks!

#53294 - Cearn - Wed Sep 07, 2005 9:16 am

_tum wrote:
Don't want to get deep into the task but those subtractions look a bit suspicious to me. What if colorB components are less then corresponding
colorA ones - you get negative results in r8,r9,r10, then you multiply them... Are you sure you'll still get the correct result?

Negative numbers are fine, that's what's supposed to happen. That said, shifting does should have been done with an ASR instead of an LSR, but since this only affect the top 5 bits (which you're not using), everything's fine.
What isn't fine is the [0,31] range. The step (blend weight, whatever) is a basically .5 fixed number, but should have the range between 0 and 1 inclusive. Meaning [0,32]. It's not a major concern, but strictly speaking the former range isn't accurate.
Also, correct me if I'm wrong, but doesn't using 30 as a starting range only do 15 interpolations?

As for optimisations, the original basically has 24 instructions inside the loop for 34 cycles (24i/34c). I have two alternatives:
* pal_blend16, doing a single color/loop (20i/28c)
* pal_blend32, doing two colors/loop (24i/38c, so effectively 19c per individual color)

Code:
Basic format:
@ DECL: void pal_blend16(COLOR *dst, COLOR *srca, COLOR *srcb,
@    u32 alpha, int nclrs) CODE_IN_IWRAM;
@ DECL: void pal_blend32(COLOR *dst, COLOR *srca, COLOR *srcb,
@    u32 alpha, int nclrs) CODE_IN_IWRAM;
@ DESC: blends palettes srca and srcb into dst.
@   \param dst. Destination palette
@   \param srca, srcb. Source palettes
@   \param alpha. Blend weight 0-32
@   \param nclrs. Number of colors

pal_blend16 is probably a little non-standard, with only a single MUL for the interpolation, rather than the usual three. The color components are pulled apart to a 10.10.10 format; multiplying by up to 32 will fill up these 10 bits/component without upsetting the others. Yes, this works. Quite fast too.
Also, here's a fun little fact about MUL. The format for this is "mul Rd, Rm, Rs" for Rd= Rm*Rs. The number of significant bits of Rs determines the full time; if that's low, the multiplication is faster. The order of multiplicants matters: I'd lose 3 cycles if it were reversed. Thank you GBATek for mentioning this.
Code:
@ --- Reglist: ---
@   r0, r1, r2: dst, srca, srcb
@   r12: nclrs
@   r4, r5: colors a & b
@   r6: mask
@   r7 temp
@   r3: alpha
    .section .iwram,"ax", %progbits
    .align 2
    .code 32
    .global pal_blend16
pal_blend16:
    ldr     r12, [sp]   @ get nclrs from stack
    cmp     r12, #0     @ Ain't workin' if there's
    bxeq    lr          @ nothing to do.
    stmfd   sp!, {r4-r7}
    ldr     r6, =0x01F07C1F         @ load mask
.Lpbld3_loop:
    @ prep a
    ldrh    r7, [r1], #2            @ ---bgr
    and     r4, r6, r7, lsl #5      @ ---g--
    orr     r4, r4, r7, lsr #10     @ ---g-b
    and     r7, r7, r6              @ ---b-r
    orr     r4, r4, r7, lsl #20     @ -r-g-b
    @ prep b
    ldrh    r7, [r2], #2            @ ---bgr
    and     r5, r6, r7, lsl #5      @ ---g--
    orr     r5, r5, r7, lsr #10     @ ---g-b
    and     r7, r7, r6              @ ---b-r
    orr     r5, r5, r7, lsl #20     @ -r-g-b
    @ a and b are now 10.10.10 rgb with only the lower 5 bits
    @ filled. We can mul in range [0,32] with abandon
    @ c= a + w*(b-a)/32
    sub     r5, r5, r4
    mul     r7, r5, r3              @ NOTE: r7,r3,r5 takes longer
    add     r4, r4, r7, asr #5
    @ convert back to 15bit bgr
    and     r4, r4, r6              @ -r-g-b
    orr     r4, r4, r4, lsl #15     @ grbg-b
    mov     r5, r4, lsr #20         @ ----gr
    orr     r4, r5, r4, lsl #10     @ bg-bgr
    @ --- write blended, loop
    strh    r4, [r0], #2
    subs    r12, r12, #1
    bgt     .Lpbld3_loop
    ldmfd   sp!, {r4-r7}
    bx      lr

pal_blend32 is a pretty run of the mill double-color interpolater. Instruction count is reduced a bit by keeping the masks in registers for easier shifted-ANDing, and having color a's components in .5 fixed point so I could use MLA instead of separate MULs and ADDs.
The usual alignment matters for words apply here.
Code:
@ --- Reglist: ---
@   r0, r1, r2: (u32*)dst, (u32*)srca, (u32*)srcb
@   r3: alpha
@   r4, r5: work color components a & b
@   r6, r7: green/a & red/b masks (gm, rm)
@   r8, r9, r10: current colors (c=*dst, a=*srca, b=*srcb)
@   r12: nclrs
    .section .iwram,"ax", %progbits
    .align 2
    .code 32
    .global pal_blend32
pal_blend32:
    ldr     r12, [sp]               @ get nclrs from stack
    movs    r12, r12, lsr #1        @ adjust nclrs for u32 run
    bxeq    lr        @ only run if there's something to do
    stmfd   sp!, {r4-r10}
    mov     r7, #31
    orr     r7, r7, r7, lsl #16     @ rm= 0x001F001F
    mov     r6, r7, lsl #5          @ gm= 0x03E003E0
.Lpbld_loop:
    ldr     r9, [r1], #4            @ a0= *pa++
    ldr     r10, [r2], #4           @ b0= *pb++
    @ --- reds: rc = (ra*32 + alpha*(rb-ra))/32
    and     r4, r6, r9, lsl #5      @ a= ra<<5
    and     r5, r10, r7             @ b= rb
    sub     r5, r5, r4, lsr #5      @ b -= a>>5
    mla     r4, r5, r3, r4          @ a += alpha*b
    and     r8, r7, r4, asr #5      @ c= rm & (a>>5)
    @ --- greens: gc= ga + alpha*(gb-ga)/32
    and     r4, r9, r6              @ a= ga
    and     r5, r7, r10, lsr #5     @ b= gb>>5
    sub     r5, r5, r4, lsr #5      @ b -= a>>5
    mla     r4, r5, r3, r4          @ a += alpha*b
    and     r4, r4, r6              @ a &= gm
    orr     r8, r8, r4              @ c |= a
    @ --- blues: bc= ba + alpha*((bb-ba)/32)
    and     r4, r6, r9, lsr #5      @ a= ba>>5
    and     r5, r7, r10, lsr #10    @ b= bb>>10
    sub     r5, r5, r4, lsr #5      @ b -= a>>5
    mla     r4, r5, r3, r4          @ a += alpha*b
    and     r4, r4, r6              @ a &= gm
    orr     r8, r8, r4, lsl #5      @ c |= a<<5
    @ --- write blended, loop
    str     r8, [r0], #4            @ *pc++= c
    subs    r12, r12, #1
    bgt     .Lpbld_loop
    ldmfd   sp!, {r4-r10}
    bx      lr

#53322 - Cearn - Wed Sep 07, 2005 2:49 pm

20060807 full edit:

16 insns/main loop. Two colors/iteration, 30cycles. For single color version, change the color ldr/str to ldrh/strh and remove the initial halving of the counter.

Code:
/*
void pal_blend_asm(COLOR *pa, COLOR *pb, COLOR *pc,
    int nclrs, u32 alpha) CODE_IN_IWRAM;
//! Blends palettes pa and pb into pc.
//! \param pa, pb. Source palettes
//! \param pc. Destination palette
//! \param nclrs Number of colors
//! \param alpha Blend weight 0-32
//! \note u32 version, 2 clrs/loop. Loop: 16i/30c, Barrel shifter FTW.
*/
    .section .iwram,"ax", %progbits
    .align 2
    .arm
    .global pal_blend_asm
pal_blend_asm:
    movs    r3, r3, lsr #1          @ adjust nclrs for u32 run
    bxeq    lr                      @ quit on nclrs=0
    ldr     r12, [sp]               @ get alpha from stack
    stmfd   sp!, {r4-r10}
    ldr     r7, =0x03E07C1F         @ MASKLO: -g-|b-r
    mov     r6, r7, lsl #5          @ MASKHI: g-|b-r-
.Lpbld_loop:
        ldr     r8, [r0], #4            @ a= *pa++
        ldr     r9, [r1], #4            @ b= *pb++
        @ --- -g-|b-r
        and     r4, r6, r8, lsl #5      @ x/32: (-g-|b-r)
        and     r5, r7, r9              @ y: -g-|b-r
        sub     r5, r5, r4, lsr #5      @ z: y-x
        mla     r4, r5, r12, r4         @ z: (y-x)*w + x*32
        and     r10, r7, r4, lsr #5     @ blend(-g-|b-r)           
        @ --- b-r|-g- (rotated by 16 for cheapskatiness)
        and     r4, r6, r8, ror #11     @ x/32: -g-|b-r (ror16)
        and     r5, r7, r9, ror #16     @ y: -g-|b-r (ror16)
        sub     r5, r5, r4, lsr #5      @ z: y-x
        mla     r4, r5, r12, r4         @ z: (y-x)*w + x*32
        and     r4, r7, r4, lsr #5      @ blend(-g-|b-r (ror16))
        @ --- mix -g-|b-r and b-r|-g-
        orr     r10, r10, r4, ror #16
        @ --- write blended, loop
        str     r10, [r2], #4           @ *pc++= c
        subs    r3, r3, #1
        bgt     .Lpbld_loop       
    ldmfd   sp!, {r4-r10}
    bx      lr