gbadev.org forum archive

Hi, I've got a loop I'd like to optimize.

global.screens are main ram buffers, 256*192 uint8's in length.

Code:

//The 8 bit version, which takes about 15ms
int j=256*192;
uint8* dest=global.screens[0];
uint8* toplayer=global.screens[2];
uint8* botlayer=global.screens[1];

while(j--)
{
if(*toplayer == 0)
*dest = *botlayer;
else
*dest = *toplayer;
dest++;
botlayer++;
toplayer++;
}

The next thing I tried was doing 2 bytes at a time, although optimizing from here I get a little ham fisted, which is why Im asking for advice.

Code:

//16bit version, which takes about 9ms
int j=(256*192)/2;
uint16* dest=(uint16*)global.screens[0];
uint16* toplayer=(uint16*)global.screens[2];
uint16* botlayer=(uint16*)global.screens[1];

uint16 topA,topB,top;
while(j--)
{
top = *toplayer;
if(top ==0)
{
   *dest = *botlayer;
}
else
{
   topA =*toplayer & 0xFF00;
   topB =*toplayer & 0x00FF;

   if (topA && topB )
   {
      *dest= *toplayer;
   }
   else
   if (!topA && topB)
   {
      *dest= topB | (*botlayer & 0xFF00);
   }
   else
   {
      *dest= topA | (*botlayer & 0x00FF);
   }
}
dest++;
botlayer++;
toplayer++;
}

I would like to see the assembly.
In cases of these fast followed short if/then it would be interesting to see if conditioned code is produced rather then branching.

It might also be faster to increase only an indexing variable and use indirect addressing rather then increasing all 3 base pointers.

But it all depends on what the compiler already optimizes.
_________________
GBAMP Multiboot

If I could actually get the -S flag to work with my makefile, this would be really interesting.

-O3 and the 2nd version runs about 10x faster... (on the pathological best case of an empty top layer)

Last edited by TwentySeven on Sun Mar 22, 2009 1:33 pm; edited 1 time in total

Try "-save-temps"

-S does not work together with -g
_________________
GBAMP Multiboot

Oh, thanks, that did it.

Grabbing the output now.

This is the o2 output for the 16 bit version

Code:

.Ltext0:
.align 2
.global _Z21pixelscreen_Compositev
.type _Z21pixelscreen_Compositev, %function
_Z21pixelscreen_Compositev:
.fnstart
.LFB107:
.file 1 "c:/projects/dsclient/source/Pixelscreen.cpp"
.loc 1 149 0
@ Function supports interworking.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
.LBB11:
.loc 1 173 0
ldr r1, .L12
ldr r3, .L12+4
.loc 1 175 0
ldr r2, .L12+8
.LBE11:
.loc 1 149 0
stmfd sp!, {r4, r5, r6, r7}
.save {r4, r5, r6, r7}
.LCFI0:
.LBB12:
.loc 1 173 0
ldr r4, [r1, r3]
.loc 1 174 0
add r3, r3, #8
.loc 1 175 0
ldr r6, [r1, r2]
.loc 1 174 0
ldr r7, [r1, r3]
.loc 1 175 0
mov r1, #0
b .L6
.LVL0:
.L11:
.loc 1 183 0
ldrh r3, [r6, r1]
strh r3, [r4, r1] @ movhi
.L3:
.loc 1 201 0
add r1, r1, #2
.loc 1 178 0
cmp r1, #49152
beq .L10
.L6:
.loc 1 180 0
ldrh r2, [r7, r1]
.loc 1 181 0
cmp r2, #0
beq .L11
.loc 1 190 0
ands r5, r2, #255
moveq r0, #0
movne r0, #1
ands ip, r2, #65280
moveq r3, #0
andne r3, r0, #1
cmp r3, #0
.loc 1 192 0
strneh r2, [r4, r1] @ movhi
.loc 1 190 0
bne .L3
.loc 1 195 0
cmp ip, #0
movne r3, #0
andeq r3, r0, #1
cmp r3, #0
.loc 1 197 0
ldrneh r3, [r6, r1]
.loc 1 201 0
ldreqb r3, [r6, r1] @ zero_extendqisi2
.loc 1 197 0
bicne r3, r3, #255
orrne r3, r5, r3
.loc 1 201 0
orreq r3, ip, r3
.loc 1 197 0
strneh r3, [r4, r1] @ movhi
.loc 1 201 0
streqh r3, [r4, r1] @ movhi
add r1, r1, #2
.loc 1 178 0
cmp r1, #49152
bne .L6
.L10:
.LBE12:
.loc 1 211 0
ldmfd sp!, {r4, r5, r6, r7}
bx lr
.L13:
.align 2
.L12:
.word global
.word 99976
.word 99980
.LFE107:
.fnend
.size _Z21pixelscreen_Compositev, .-_Z21pixelscreen_Compositev

For the sake of completeness, the -O3 output is here:

http://www.mydevstuff.com/twoseven/athia/o3Output.txt

There is still place for optimization.
Esp when merging the two components.

Something along

Code:

const int curTop = *toplayer ;
u16 mask = (curTop & 0xFF00)?0xFF00:0 ;
mask = mask | ((curTop & 0xFF)?0xFF:0 ;

if (mask == 0xFFFF)
{
*dest = curTop ; /* complete from top layer */
} else
{ /* merge from both */
*dest = (curTop & mask) | (*botlayer & ~mask) ;
}

This "should" remove the step over the boolean values and branching for them (topA and topB)
_________________
GBAMP Multiboot

Hmm I see what you're done there :)

Under -O3 it comes out to the same speed on the hardware, still about 8ms for ~70% best case.

Depending on where you have those buffers (doesn't seem to be vram if you previously did it by 8bit access) you can easily extend it to do 4 pixels each iteration with the masks.
That will however only make noticeable difference if you have a 32bit bus to the mem.
_________________
GBAMP Multiboot

Like this, right?

Thats about 12% faster.. 8ms down to 6-7ms - that implys I have a 32bit bus right?

Code:

int j=(256*192)/4;
uint32* dest=(uint32*)global.screens[0];
uint32* toplayer=(uint32*)global.screens[2];
uint32* botlayer=(uint32*)global.screens[1];

while(j--)
{

   if(*toplayer ==0)
   {
      *dest = *botlayer;
   }
   else
   {

      const int curTop = *toplayer ;
      u32 mask = (curTop & 0xFF000000) ? 0xFF000000 : 0;
      mask = mask | ((curTop & 0x00FF0000) ? 0x00FF0000 : 0);
      mask = mask | ((curTop & 0x0000FF00) ? 0x0000FF00 : 0);
      mask = mask | ((curTop & 0x000000FF) ? 0x000000FF : 0);
   if (mask == 0xFFFFFFFF)
      {
         *dest = curTop ; /* complete from top layer */
      } else
      { /* merge from both */
         *dest = (curTop & mask) | (*botlayer & ~mask) ;
      }

   }

   dest++;
   botlayer++;
   toplayer++;
}

I'd say so.
Cache might play in here too (Which is def. connected by the full bus width)
_________________
GBAMP Multiboot

One question: Why optimizing? Finish your game and then profile..if it's to slow.

Well if it comes to that, i'd use the hardware features.
But i somehow think this is part of some emulating.
_________________
GBAMP Multiboot

Haven't tried this yet, but it should be decent:

Code:

void bmp8AlphaBlit(const void *topv, const void *bottomv, void *dstv)
{
u32 *topD= (u32*)topv;
u32 *botD= (u32*)bottomv;
u32 *dstD= (u32*)dstv;

const u32 baseMask= 0x01010101;

for(int i=0; i<256*192/4; i++)
{
   u32 top= *topD++;
   u32 mask = top | top>>4;
   if(mask != 0)
   {
      mask |= mask>>2;
      mask |= mask>>1;
      mask = (mask&baseMask)*255;
      top |= *botD++ &~ mask;
   }
   else
      top= *botD++;

   *dstD++ = top;
}
}

Optimizing the loop condition in C is usually pointless. It'll usually be converted into an incrementing loop (i.e., for(;;i++) ) anyway. According to the manual, loads and stores should be 1 cycle if waitstates and interlocks aren't an issue. Special casing if mask==0xFFFFFFFF might actually cost a little more due to the extra instructions it requires. This will depend on caching and how random the top buffer's data is.

For the mask, don't use the ternary operator. The false-expression adds in unnecessary instructions. Instead, try a form of binary search pattern. if you have a byte with bit-pattern 'abcdefgh', x |= x>>4; combines both nybbles. x |= x>>2 then combines combines pairs and x |= x>>1 effectively makes bit 0 the inclusive OR over all 8 bits. remove the junk from the other bits and *255 for the final mask. The first step in this process is actually done early and used for the fully-transparent case, because if x==0, then x|x>>4 will be as well. (Hat tip to Cydrak for demonstrating this in a post looong ago)

You don't need to mask the top data. Since the mask was based on the top data, you'd just be masking out zeroes.

Unfortunately, the compiler doesn't quite give the best output, so here's the asm version as well. Add the initialization of r0-r2 if necessary.

Code:

.text
.arm
.align
.global .bmp8AlphaBlit
.bmp8AlphaBlit
@ reglist:
@ r0 : top ptr
@ r1 : bottom ptr
@ r2 : destination ptr
@ r3 : size
@ r4 : top / tmp data
@ r5 : bottom data
@ r6 : mask required for making the pixel mask
@ ip : pixel mask

stmfd sp!, {r4-r6}

ldr r3,=256*192/4
ldr r6,=0x01010101

.Lloop:
ldmia r4, {r0}
orrs ip, ip, r4, lsr #4 @ First step in mask prep
@ --- Full see-through ---
ldmeqia r4, {r1}
beq .Lwrite

@ --- Partial see-through ---
ldmia r5, {r1}
@ + Mask prep. Bits: abcdefgh
@ orrs ip, ip, r4, lsr #4 | (ae)(bf)(cg)(dh)
orr ip, ip, r4, lsr #2 @ | (aceg)(bdfh)
orr ip, ip, ip, lsr #1 @ | (abcdefgh)
and ip, ip, r6 @ | bit 0: 1 if any of abcdefgh were set
rsb ip, ip, ip, lsl #8 @ + *255

bic r5, r5, ip @ - top | (bottom &~ mask)
orr r4, r4, r5 @ /

.Lwrite:
subs r3, r3, #1
stmia r4, {r2} @ Delayed write.
bne .Lloop

ldmfd sp!, {r4-r6}
bx lr

.size bmp8AlphaBlit, .-bmp8AlphaBlit

Thankyou very much for your help, guys.

a128: Aye, and typically thats exactly how I work. Unfortunately I need to get software blitting up alot faster then my original naieve "it works" version, as theres other similar bits of code scheduled.

So this is more research, then development.

Theres bugger-all performance difference between the last 3 versions pasted here, the (beautiful...) hand asm'd version is 6 to 7ms.

Over twice as fast as the original byte-by-byte c version.

The bitmasking is quite elegant there, Cearn. :-)

For the check against 0xFFFFFFFF i'm not quite with you. It really depends on the average situation there tho, how often there will be a row of 4 solids.

The check itself is just a CMN rN,#1 taking 1S cycle. (added to all cases with at leat one visible top pixel)
This compare can skip (ldr with ne condition) the load of the bottom pixels (r5 in your code if its moved just before the mask is applied) which will reduces execution time in this case by 1N +1I cycle.

So depending on the amount filled in top that additional 1S might be helpfull.
_________________
GBAMP Multiboot

I guess you can cut out one of the ldm like this

Code:

.Lloop:
ldmia r0!, {r4}
orrs ip, ip, r4, lsr #4 @ First step in mask prep
@ --- Full see-through ---
ldmia r1!, {r5}
beq .Lwrite

@ --- Partial see-through ---
@ + Mask prep. Bits: abcdefgh
@ orrs ip, ip, r4, lsr #4 | (ae)(bf)(cg)(dh)
orr ip, ip, r4, lsr #2 @ | (aceg)(bdfh)
orr ip, ip, ip, lsr #1 @ | (abcdefgh)
and ip, ip, r6 @ | bit 0: 1 if any of abcdefgh were set
rsb ip, ip, ip, lsl #8 @ + *255

bic r5, r5, ip @ - top | (bottom &~ mask)
orr r5, r4, r5 @ /

.Lwrite:
subs r3, r3, #1
stmia r2!, {r5} @ Delayed write.
bne .Lloop

_________________
I probably suck, my not is a programmer.

Woops! I wasn't paying enough attention to my compiler output.

I havn't been able to make the hand-asm'd version work.
It's linking correctly, however, it doesnt appear to generate any output.

(the C version of the function was getting 6-7ms)

Any clue as to whats wrong?

TwentySeven wrote:

Woops! I wasn't paying enough attention to my compiler output.

I havn't been able to make the hand-asm'd version work.
It's linking correctly, however, it doesnt appear to generate any output.

(the C version of the function was getting 6-7ms)

Any clue as to whats wrong?

I seem to have made a few errors here. Treated ldm/stm as if they were ldr/str and had there should be no r4 in the ORR ... lsr #2 line. Together with Flubba's improvement (I swear there was a reason for the second bottom-load, but I really can't see what it was anymore).

Code:

.text
.arm
.align
.global .bmp8AlphaBlit
.bmp8AlphaBlit
@ reglist:
@ r0 : top ptr
@ r1 : bottom ptr
@ r2 : destination ptr
@ r3 : size
@ r4 : top / tmp data
@ r5 : bottom data
@ r6 : mask required for making the pixel mask
@ ip : pixel mask

stmfd sp!, {r4-r6}

ldr r3,=256*192/4
ldr r6,=0x01010101

.Lloop:
ldmia r0!, {r4}
ldmia r1!, {r5}
orrs ip, ip, r4, lsr #4 @ First step in mask prep, and C-thru test
beq .Lwrite

@ --- Partial see-through ---

orr ip, ip, ip, lsr #2 @ + rest of mask making
orr ip, ip, ip, lsr #1 @ | bit 0: 1 if any of bits 0-7 were set
and ip, ip, r6 @ |
rsb ip, ip, ip, lsl #8 @ + *255

bic r5, r5, ip @ - top | (bottom &~ mask)
orr r5, r4, r5 @ /

.Lwrite:
subs r3, r3, #1
stmia r2!, {r5} @ Delayed write.
bne .Lloop

ldmfd sp!, {r4-r6}
bx lr

.size bmp8AlphaBlit, .-bmp8AlphaBlit

If there are still problems, I'll try to run a test later to make sure it works.

Also note Mighty Max's point about the mask check. Unrolling a little may help too; I've noticed in the past that a double-load ldm is faster than two single ones, even though the docs say it shouldn't matter.

Well, it displays now, but the bottom layer doesn't show through, so I guess its still buggy?

Thanks again for trying, though!

Code:

orrs ip, ip, r4, lsr #4 @ First step in mask prep

Should be:

Code:

orrs ip, r4, r4, lsr #4 @ First step in mask prep

_________________
I probably suck, my not is a programmer.

gbadev.org forum archive

DS development > Optimization help?

#167684 - TwentySeven - Sun Mar 22, 2009 10:16 am

#167685 - Mighty Max - Sun Mar 22, 2009 10:45 am

#167687 - TwentySeven - Sun Mar 22, 2009 11:34 am

#167690 - Mighty Max - Sun Mar 22, 2009 12:00 pm

#167691 - TwentySeven - Sun Mar 22, 2009 12:10 pm

#167692 - TwentySeven - Sun Mar 22, 2009 12:13 pm

#167693 - TwentySeven - Sun Mar 22, 2009 12:18 pm

#167694 - Mighty Max - Sun Mar 22, 2009 12:39 pm

#167697 - TwentySeven - Sun Mar 22, 2009 1:36 pm

#167699 - Mighty Max - Sun Mar 22, 2009 2:02 pm

#167700 - TwentySeven - Sun Mar 22, 2009 2:27 pm

#167701 - Mighty Max - Sun Mar 22, 2009 2:45 pm

#167702 - a128 - Sun Mar 22, 2009 3:56 pm

#167704 - Mighty Max - Sun Mar 22, 2009 5:08 pm

#167705 - Cearn - Sun Mar 22, 2009 6:45 pm

#167706 - TwentySeven - Sun Mar 22, 2009 7:04 pm

#167707 - Mighty Max - Sun Mar 22, 2009 7:37 pm

#167708 - FluBBa - Sun Mar 22, 2009 9:16 pm

#167711 - TwentySeven - Mon Mar 23, 2009 2:54 am

#167714 - Cearn - Mon Mar 23, 2009 8:22 am

#167715 - TwentySeven - Mon Mar 23, 2009 9:53 am

#167716 - FluBBa - Mon Mar 23, 2009 1:31 pm