#166907 - Lazy1 - Mon Feb 23, 2009 10:07 pm
Once again looking at my code to display a 1bpp image on the DS I noticed that my timing before was way off and my current method takes around 14ms to convert a 512x342 image to 8bpp.
I have made one small change to only draw a small window instead which is much faster but I feel there may be a more efficient way to do this.
Also, I moved the conversion to the arm7 to leave the arm9 free for other things.
Arm7 code:
Code: |
u32 FBConvTable[ 256 ];
void V_Start( void ) {
u32 Value = 0;
int i = 0;
for ( i = 0; i < 256; i++ ) {
Value = 0;
if ( ! ( i & BIT( 0 ) ) ) Value |= ( 1 << 24 );
if ( ! ( i & BIT( 1 ) ) ) Value |= ( 1 << 16 );
if ( ! ( i & BIT( 2 ) ) ) Value |= ( 1 << 8 );
if ( ! ( i & BIT( 3 ) ) ) Value |= 1;
FBConvTable[ i ] = Value;
}
}
/*
 * FIFO data-message handler: expands a window of a 1bpp image to 8bpp.
 *
 * Count    - value passed by the FIFO layer; a zero value means there is
 *            nothing to do.
 * Userdata - unused.
 *
 * The message is an array of u32: [0] destination (8bpp, written as words),
 * [1] source (1bpp bytes), [2] size (not needed here), [3] window width in
 * pixels, [4] window height in rows.  Both buffers are walked with a pitch
 * of 512 pixels (128 destination words / 64 source bytes per row).
 *
 * After the conversion a one-byte message is sent back on the same channel.
 */
void V_Draw( int Count, void* Userdata ) {
    /* Zero-filled so a short message yields an empty window, not garbage. */
    u32 Data[ 6 ] = { 0 };
    u32* Gfx;
    u8* Fb;
    int x;
    int y;
    int DrawWindow_Width;
    int DrawWindow_Height;
    int RowBytes;

    ( void ) Userdata;

    if ( ! Count )
        return;

    /* Bound the read by the buffer, not by the size the sender announced. */
    fifoGetDatamsg( FIFO_USER_01, sizeof( Data ), ( u8* ) Data );

    Gfx = ( u32* ) Data[ 0 ];
    Fb = ( u8* ) Data[ 1 ];
    DrawWindow_Width = ( int ) Data[ 3 ];
    DrawWindow_Height = ( int ) Data[ 4 ];

    /* Source bytes touched per row; a partial trailing byte counts as one. */
    RowBytes = ( DrawWindow_Width + 7 ) / 8;

    for ( y = 0; y < DrawWindow_Height; y++ ) {
        for ( x = 0; x < RowBytes; x++ ) {
            /* High nibble first, then the low nibble.  The second lookup
               passes the whole byte: this relies on the table entries
               depending only on the low four index bits, which is how
               V_Start builds them. */
            *Gfx++ = FBConvTable[ *Fb >> 4 ];
            *Gfx++ = FBConvTable[ *Fb++ ];
        }

        /* Skip to the next row using what was actually consumed, so widths
           that are not a multiple of 8 do not drift. */
        Gfx+= 128 - ( RowBytes * 2 );
        Fb+= 64 - RowBytes;
    }

    /* Completion notification for the other CPU. */
    fifoSendDatamsg( FIFO_USER_01, 1, ( u8* ) Data );
}
|
It's very simple so far and I still have to add window scrolling in but I wanted to get other people's thoughts on improving the speed/efficiency before going farther.
Unless no$gba is lying to me the arm7 takes around 190 hblanks to convert a 320x240 window, notify the arm9 and have the arm9 dma it to vram.
I'm also not sure how good of an arrangement this is having the arm7 doing the drawing.
Arm9 code:
Code: |
/*
 * Arm9 side of the conversion: posts a five-word request on FIFO_USER_01
 * (destination buffer, source image, source size, window width, window
 * height), spins until _Arm7Complete becomes non-zero, DMA-copies VBuffer
 * to the bg2 graphics memory and prints how many hblanks elapsed.
 */
void V_Draw( void ) {
    u32 Request[ 5 ];
    int FirstLine;
    int LastLine;

    Request[ 0 ] = ( u32 ) VBuffer;
    Request[ 1 ] = ( u32 ) macfb_bin;
    Request[ 2 ] = macfb_bin_size;
    Request[ 3 ] = DrawWindow_Width;
    Request[ 4 ] = DrawWindow_Height;

    FirstLine = HBlankCount;
    fifoSendDatamsg( FIFO_USER_01, sizeof( Request ), ( u8* ) Request );

    /* Busy-wait for the completion flag. */
    while ( _Arm7Complete == 0 ) {
    }

    dmaCopyHalfWords( 3, VBuffer, bgGetGfxPtr( HW2D_bg2 ), ( 512 * 342 ) );
    LastLine = HBlankCount;

    /* Re-arm the flag for the next frame. */
    _Arm7Complete = 0;
    iprintf( "Arm7 draw took %d hblanks.\n", ( LastLine - FirstLine ) );
}
|
#166910 - elhobbs - Mon Feb 23, 2009 10:41 pm
I am guessing that you intend to have the arm9 do some work while waiting for the arm7 to finish - the fast cpu waiting idle for the slow cpu to do some work is definitely not going to help much ;) I would be surprised if any of the emulators properly emulate memory contention between the arm7 and arm9. Still, 14ms sounds like a lot to me...
I have never tried it myself but you may want to take a look at decompressing a compressed native ds format (I am guessing here that you are using the 1bit image to save memory?) directly to vram instead of messing with the arm7.
#166911 - Dwedit - Mon Feb 23, 2009 10:49 pm
Faster algorithm to turn 1bpp into 8bpp (assuming color number #1)
input is an array of bytes, output is an array of u32's
// Expand one input byte (8 pixels, 1 bit each) into two output words
// (4 pixels each, 1 byte per pixel holding 0 or 1).
a=input[x];
x++;
// Low nibble: input bits 0-3.
b=a&0x0F;
// Replicate the nibble at bit offsets 0, 7, 14 and 21 so that nibble
// bit n ends up at bit position 8*n.
b|=b<<7;
b|=b<<14;
// Keep only bit 0 of each byte.
b&=0x01010101;
output[y]=b;
y++;
// High nibble: input bits 4-7, same trick.
b=a>>4;
b|=b<<7;
b|=b<<14;
b&=0x01010101;
output[y]=b;
y++;
Compile in ARM mode, probably into fast ram.
EDIT: Gcc can't generate good ASM code for this, so here it is:
Code: |
@ Expands one source byte (8 pixels at 1bpp) into two words (8 pixels at 8bpp).
@ r1 = source byte pointer, r3 = destination word pointer (both post-incremented)
@ r0, r2 = scratch, r4 = mask selecting bit 0 of every byte
ldr r4,=0x01010101 @put this outside of the inner loop
@ fetch the next source byte
ldrb r0,[r1],#1
@ low nibble (bits 0-3): replicate at offsets 0/7/14/21 so nibble bit n
@ lands on bit 8*n, then mask to one bit per byte
and r2,r0,#0x0F
orr r2,r2,r2,lsl#7
orr r2,r2,r2,lsl#14
and r2,r2,r4
str r2,[r3],#4
@ high nibble (bits 4-7): same expansion
mov r2,r0,lsr#4
orr r2,r2,r2,lsl#7
orr r2,r2,r2,lsl#14
and r2,r2,r4
str r2,[r3],#4
|
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#166912 - Dwedit - Mon Feb 23, 2009 11:12 pm
ASM code again...
This is for bit order where the LEAST significant bit is the first in a sequence of 8 bytes to output, which isn't the same as your code.
Code: |
@void conv_image(u32 *dest, u8 *src, int size);
@ Expands `size` source bytes of 1bpp data into 8 output bytes each (0 or 1),
@ least significant source bit first.
@ size must be at least 1: the counter is decremented before it is tested,
@ so a size of 0 would wrap around instead of returning immediately.
conv_image:
@r0 = dest, r1 = src, r2 = size
@ r4-r6 are callee-saved and used as scratch / mask
stmfd sp!,{r4,r5,r6}
ldr r6,=0x01010101
0:
@ next source byte
ldrb r3,[r1],#1
@ r4 = bits 0-3 expanded to one byte per bit
and r4,r3,#0x0F
orr r4,r4,r4,lsl#7
orr r4,r4,r4,lsl#14
and r4,r4,r6
@ r5 = bits 4-7 expanded to one byte per bit
mov r5,r3,lsr#4
orr r5,r5,r5,lsl#7
orr r5,r5,r5,lsl#14
and r5,r5,r6
@ store both words (r4 at the lower address) and advance dest by 8 bytes
stmia r0!,{r4,r5}
subs r2,r2,#1
bne 0b
ldmfd sp!,{r4,r5,r6}
bx lr
|
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#166914 - Dwedit - Tue Feb 24, 2009 12:45 am
Now I made big and little endian bit order versions.
_le outputs ones for each bit, starting with the least significant bit,
_be outputs ones for each bit, starting with the most significant bit. Looks like the BE version is even faster!
I still haven't tested these yet, but I'm positive they'll work.
Neglecting memory speed, it should process about 2272 pixels per scanline on the ARM9, so converting 320x240 should take about 34 scanlines.
Code: |
@void conv_image_le(u32 *dest, u8 *src, int size);
@ Expands `size` source bytes of 1bpp data into 8 output bytes each (0 or 1).
@ Output order within a source byte: bit 0 first, bit 7 last.
@ size must be at least 1: the counter is decremented before it is tested,
@ so a size of 0 would wrap around instead of returning immediately.
conv_image_le:
@r0 = dest, r1 = src, r2 = size
@ r4-r6 are callee-saved and used as scratch / mask
stmfd sp!,{r4,r5,r6}
ldr r6,=0x01010101
0:
@ next source byte
ldrb r3,[r1],#1
@ r4 = bits 0-3: replicate the nibble at offsets 0/7/14/21 so that
@ nibble bit n lands on bit 8*n, then keep bit 0 of each byte
and r4,r3,#0x0F
orr r4,r4,r4,lsl#7
orr r4,r4,r4,lsl#14
and r4,r4,r6
@ r5 = bits 4-7, same expansion
mov r5,r3,lsr#4
orr r5,r5,r5,lsl#7
orr r5,r5,r5,lsl#14
and r5,r5,r6
@ store both words (r4 at the lower address) and advance dest by 8 bytes
stmia r0!,{r4,r5}
subs r2,r2,#1
bne 0b
ldmfd sp!,{r4,r5,r6}
bx lr
@void conv_image_be(u32 *dest, u8 *src, int size);
@ Expands `size` source bytes of 1bpp data into 8 output bytes each (0 or 1).
@ Output order within a source byte: bit 7 first, bit 0 last.
@ size must be at least 1: the counter is decremented before it is tested,
@ so a size of 0 would wrap around instead of returning immediately.
conv_image_be:
@r0 = dest, r1 = src, r2 = size
@ r4-r6 are callee-saved and used as scratch / mask
stmfd sp!,{r4,r5,r6}
ldr r6,=0x01010101
0:
@ next source byte
ldrb r3,[r1],#1
@ move the byte to bits 24-31, then add copies shifted right by 9, 18
@ and 27; the copies do not overlap
mov r4,r3,lsl#24
orr r4,r4,r4,lsr#9
orr r4,r4,r4,lsr#18
@ r5: bits 0/8/16/24 now hold source bits 3/2/1/0
and r5,r6,r4
@ r4: after a further shift by 4, bits 0/8/16/24 hold source bits 7/6/5/4
and r4,r6,r4,lsr#4
@ r4 (source bits 7-4) goes to the lower address, r5 (bits 3-0) follows
stmia r0!,{r4,r5}
subs r2,r2,#1
bne 0b
ldmfd sp!,{r4,r5,r6}
bx lr
|
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
Last edited by Dwedit on Tue Feb 24, 2009 1:15 am; edited 2 times in total
#166915 - Lazy1 - Tue Feb 24, 2009 1:12 am
conv_image_be Gives me 175 hblanks on hardware but unfortunately gives me a white screen.
EDIT: That's for 512x342, sorry.
#166916 - Lazy1 - Tue Feb 24, 2009 2:12 am
elhobbs wrote: |
I am guessing that you intend to have the arm9 do some work while waiting for the arm7 to finish - the fast cpu waiting idle for the slow cpu to do some work is definitely not going to help much ;) I would be surprised if any of the emulators properly emulate memory contention between the arm7 and arm9. Still, 14ms sounds like a lot to me...
I have never tried it myself but you may want to take a look at decompressing a compressed native ds format (I am guessing here that you are using the 1bit image to save memory?) directly to vram instead of messing with the arm7. |
Well, I'd like to redo a port of Mini vMac and really try to get the speed up.
If I can get the arm7 to do all the drawing that will leave the arm9 free to emulate the 68k cpu.
The code Dwedit posted looks promising, the first ones produces backwards output but were very fast.
Combining that with only drawing part of the screen should make it fast enough to be on the arm7 with room to spare.
A long time ago tepples wrote a 1bpp software scaler which produces very nice output.
Dropping that on the arm7 would not slow down emulation as much, at most some frames would be dropped depending on how fast the scaling is.
#166918 - Dwedit - Tue Feb 24, 2009 2:28 am
I just made a test program which used conv_image_be, and it worked perfectly.
Make sure you didn't make any silly mistakes like getting the dest and src swapped, or specify the wrong size. Size is in source bytes, not pixels.
In my test program, I just gave the bit unpacker an address in VRAM as the destination.
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#166919 - Lazy1 - Tue Feb 24, 2009 3:17 am
Ah right, got it.
Thank you, it's a lot faster than what I had.
#166920 - Dwedit - Tue Feb 24, 2009 3:41 am
Unrolled version (nocash says it's 50% faster)
Source address must be word aligned
Nocash says the previous routine took 159,749 cycles for a 256x192 bitmap, the new version here takes 102,921 cycles for the same bitmap
Code: |
@ Unrolled 1bpp -> 8bpp expansion: four source bytes (one word load) per
@ iteration, eight output words per iteration, source bit 7 of each byte first.
@ r0 = dest (post-incremented by 32 bytes per iteration)
@ r1 = src (read with ldr, one word per iteration)
@ r2 = size in source bytes; the loop subtracts 4 and repeats while the
@      result is > 0, so at least one word is always processed and a size
@      that is not a multiple of 4 is effectively rounded up
@ r3-r9, r12 = output words, r10 = byte-lane mask
conv_image_be_unrolled:
stmfd sp!,{r4-r10}
ldr r10,=0x01010101
0:
ldr r9,[r1],#4
@ source byte 0 (word bits 0-7): r3 = bits 7-4, r4 = bits 3-0
mov r3,r9,lsl#24
orr r3,r3,r3,lsr#9
orr r3,r3,r3,lsr#18
and r4,r10,r3
and r3,r10,r3,lsr#4
@ source byte 1 (word bits 8-15): r5 = bits 7-4, r6 = bits 3-0
mov r5,r9,lsl#16
and r5,r5,#0xFF000000
orr r5,r5,r5,lsr#9
orr r5,r5,r5,lsr#18
and r6,r10,r5
and r5,r10,r5,lsr#4
@ source byte 2 (word bits 16-23): r7 = bits 7-4, r8 = bits 3-0
mov r7,r9,lsl#8
and r7,r7,#0xFF000000
orr r7,r7,r7,lsr#9
orr r7,r7,r7,lsr#18
and r8,r10,r7
and r7,r10,r7,lsr#4
@ source byte 3 (word bits 24-31): r9 = bits 7-4, r12 = bits 3-0
and r9,r9,#0xFF000000
orr r9,r9,r9,lsr#9
orr r9,r9,r9,lsr#18
and r12,r10,r9
and r9,r10,r9,lsr#4
@ registers are stored in ascending register order: r3,r4,...,r9,r12
stmia r0!,{r3-r9,r12}
subs r2,r2,#4
bgt 0b
ldmfd sp!,{r4-r10}
bx lr
|
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#166922 - Lazy1 - Tue Feb 24, 2009 3:55 am
That is amazing!
It works very well :D
#166924 - Lazy1 - Tue Feb 24, 2009 5:13 am
Is there any reason this wouldn't work on the arm7? no$ just crashes with "the rom image has crashed" and gives no explanation.
#166927 - Miked0801 - Tue Feb 24, 2009 6:54 am
No Arm9 specific op-codes there. No$ has bugs which cause it to flake occasionally. Add some nops and such to other areas of your code to get different alignment and try again.
#166928 - Miked0801 - Tue Feb 24, 2009 6:57 am
BTW, Dwedit: I think that can be made a bit faster yet. I'll look at it further in the morning when I'm not so tired :)
#166929 - Dwedit - Tue Feb 24, 2009 7:37 am
Preparing 2 words in 5 instructions after loading a byte is pretty damn good. Does the alternative involve generating an address in 1 instruction, then fetching two words from a 2KB lookup table?
Maybe even converting the 1bpp stream into 4bpp tiles for use in tile-mapped modes, like PocketNES or the other emulators?
Include "dirty regions" to update less information?
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#166938 - Lazy1 - Tue Feb 24, 2009 3:28 pm
I'm seriously completely amazed, your code is faster on the arm7 than mine was on the arm9.
To convert the entire 512x342 framebuffer it took only 139 hblanks.
The problem I was having earlier:
I copied the assembler source file to the arm7 directory without removing the line that puts it into itcm (oops!).
I wonder why no error was generated though, works perfectly now.
#166951 - Cearn - Tue Feb 24, 2009 5:59 pm
A slightly faster version is possible by pre-loading the byte mask. Going off of Dwedit's code:
Code: |
@ Unrolled 1bpp -> 8bpp expansion with the 0xFF000000 byte mask kept in a
@ register so each byte isolation is a single shifted AND.
@ r0 = dest (post-incremented by 32 bytes per iteration)
@ r1 = src (read with ldr, one word per iteration)
@ r2 = size in source bytes; the loop subtracts 4 and repeats while the
@      result is > 0, so at least one word is always processed
@ r3-r9, r12 = output words, r10 = byte-lane mask, r11 = top-byte mask
conv_image_be_unrolled:
stmfd sp!, {r4-r11}
ldr r10, =0x01010101
@ immediate operands take the '#' prefix, as everywhere else in this routine
mov r11, #0xFF000000
0:
ldr r9,[r1], #4
@ bits 0-7
mov r3, r9, lsl #24
orr r3, r3, r3, lsr #9
orr r3, r3, r3, lsr #18
and r4, r10, r3
and r3, r10, r3, lsr #4
@ bits 8-15
and r5, r11, r9, lsl #16
orr r5, r5, r5, lsr #9
orr r5, r5, r5, lsr #18
and r6, r10, r5
and r5, r10, r5, lsr #4
@ bits 16-23
and r7, r11, r9, lsl #8
orr r7, r7, r7, lsr #9
orr r7, r7, r7, lsr #18
and r8, r10, r7
and r7, r10, r7, lsr #4
@ bits 24-31
and r9, r11, r9
orr r9, r9, r9, lsr #9
orr r9, r9, r9, lsr #18
and r12, r10, r9
and r9, r10, r9, lsr #4
@ registers are stored in ascending register order: r3,r4,...,r9,r12
stmia r0!, {r3-r9, r12}
subs r2, r2, #4
bgt 0b
ldmfd sp!, {r4-r11}
bx lr |
Yeah, it's only 2 less instructions in the loop, but every little bit helps. Using a large lookup table may be faster depending on processor and memory region.
#166975 - Miked0801 - Wed Feb 25, 2009 5:19 am
No, just looking to skim an instruction or two. Work is ^$&$^ busy right now though so when I get home, my mind is too mushy to attack. Give me another day.
Or a lookup table perhaps :)
#167034 - Lazy1 - Fri Feb 27, 2009 4:36 am
Out of curiosity, how would I scale the screen down by half in software?
My code produces something visible but obviously is incorrect since I just guessed.
http://img514.imageshack.us/my.php?image=output.png
I don't need any code but a general idea on how it's supposed to go would be very appreciated.
#167035 - Dwedit - Fri Feb 27, 2009 4:47 am
You sum 4 pixels and average them into one pixel. Your output levels could be either 0%, 25%, 50%, 75%, or 100% intensity.
Of course, in this situation you get the opportunity to use ClearType too.
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#167053 - Lazy1 - Fri Feb 27, 2009 10:48 pm
Do you mean take 2 pixels from the top and bottom rows or 4 pixels from the same row?
So far all I'm getting is washed out gray.
#167054 - TwentySeven - Fri Feb 27, 2009 10:52 pm
Eight pixels (4x2)
ABCD
EFGH
becomes two pixels
(A+B+E+F) / 4
and
(C+D+G+H) / 4
#167056 - Lazy1 - Sat Feb 28, 2009 12:14 am
I think I get it: http://img187.imageshack.us/my.php?image=outp2.png
Expensive though, but I'm sure there is room for improvement.
#167058 - TwentySeven - Sat Feb 28, 2009 1:12 am
If you're happy with that image result, the actual algorithm can be significantly sped up...
But there are other algos too; as someone mentioned before, the DS LCD is suitable for sub-pixel rendering, although that only gives you your horizontal scaling, not vertical.
#167060 - Lazy1 - Sat Feb 28, 2009 1:35 am
I tried subpixel rendering but yeah, it doesn't look quite right.
One thing that differed in your instructions was the division by 4, that did not produce anything usable but 2 did.
Hopefully it can be sped up so the arm7 can run it within 16ms, I like the idea of a scaled display not impacting performance. :D
#167061 - Dwedit - Sat Feb 28, 2009 1:38 am
deleteme
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#167064 - Dwedit - Sat Feb 28, 2009 2:14 am
For subpixel rendering:
Assuming RGB subpixel ordering, 0 is black, 1 is white...
Switch Orange and SkyBlue if using BGR subpixel ordering
Code: |
00 - Black
00
10 or 00 - Dark Orange
00 10
01 or 00- Dark SkyBlue
00 01
11 or 00 or 10 or 01 - Gray
00 11 01 10
10 - Orange
10
01 - SkyBlue
01
10 or 11 - Light Orange
11 10
01 or 11 - Light SkyBlue
11 01
11 - White
11
|
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#167066 - TwentySeven - Sat Feb 28, 2009 3:03 am
Do you have RGB values for those? :)
#167067 - Dwedit - Sat Feb 28, 2009 3:32 am
They should be self evident enough from the color names...
In RGB:
Orange: #FF8000
DarkOrange: #804000
LiteOrange: #FFC080
SkyBlue: #0080FF
DarkSkyBlue: #004080
LiteSkyBlue: #80C0FF
Gray: #808080
White: #FFFFFF
Black: #000000
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."
#167130 - Lazy1 - Mon Mar 02, 2009 8:24 pm
I got it down to using 234 lines, still a bit too expensive though.
Can this C code be optimized?
Code: |
void V_Draw( void ) {
u8* Top = ( u8* ) macfb_bin;
u8* Bottom = Top + 64;
u32 OutBits = 0;
u32 Temp = 0;
u32* Gfx = ( u32* ) BG_BMP_RAM( 0 );
u32 TopBits;
u32 BottomBits;
int S;
int E;
int i;
S = HBlankCount;
for ( i = 0; i < macfb_bin_size; i++ ) {
TopBits = ( u32 ) ( *Top++ );
BottomBits = ( u32 ) ( *Bottom++ );
Temp = TopBits >> 6;
Temp+= BottomBits >> 6;
Temp>>= 1;
OutBits = Temp;
Temp = ( TopBits >> 4 ) & 0x03;
Temp+= ( BottomBits >> 4 ) & 0x03;
Temp>>= 1;
OutBits |= ( Temp << 8 );
Temp = ( TopBits >> 2 ) & 0x03;
Temp+= ( BottomBits >> 2 ) & 0x03;
Temp>>= 1;
OutBits |= ( Temp << 16 );
Temp = ( TopBits ) & 0x03;
Temp+= ( BottomBits ) & 0x03;
Temp>>= 1;
OutBits |= ( Temp << 24 );
*Gfx++ = OutBits;
}
E = ( HBlankCount - S );
iprintf( "Scaling took %d hblanks.\n", E );
}
|
#167131 - Miked0801 - Mon Mar 02, 2009 8:56 pm
Don't I/O read 8bits at a time. Read 32 bits source and dest once (if they are 4-byte aligned. If not, try to align them.)
If so:
Code: |
TopBits = *((u32 *) Top)++;
BottomBits = *((u32 *) Bottom)++;
OutBits = 0;
Temp = Top Bits >> 30;
Temp += BottomBits >> 30;
Temp >>= 1;
OutBits |= Temp
TopBits <<= 2;
BottomBits <<= 2;
|
Rinse and repeat for the next 15 entries.
Otherwise:
Code: |
// Byte-at-a-time variant: average the top 2-bit group of each row's byte,
// then shift the next group up instead of masking it out.
// NOTE(review): if TopBits/BottomBits are u32 (as declared in the loop being
// discussed), bits shifted above bit 7 are not discarded and would show up
// in the next ">> 6" - presumably 8-bit variables are intended; confirm.
Temp = TopBits >> 6;
Temp+= BottomBits >> 6;
Temp>>= 1;
OutBits |= Temp;
TopBits <<= 2;
BottomBits <<= 2;
...
|
and repeat the top code. Trading shifts for ands. The Shifts will generally be optimized better by the arm compiler (in ARM mode.) In reality, look at the asm output to be sure.
Or, set up a state table and run from there: The bigger your table, the faster it will run (within reason). A 256x256 entry table could do everything for you with 2 adds, 1 shift, and a lookup. Not bad eh?
Too big, then do a 16x16 table twice. 256 bytes and 4 adds, 3 shifts and an or.
#167226 - Lazy1 - Fri Mar 06, 2009 3:57 am
I changed it to only do 32bit reads/writes but now I'm not getting any useful images from my code.
I try to read in 32bits from the 1bpp framebuffer and output them 4 at a time to the ds framebuffer.
It's probably something incredibly stupid but I'm just not seeing it :/
Code: |
#define ExpandScaleDown( shift ) \
OutBits = 0;\
Temp = ( TopBits >> 30 );\
Temp+= ( BottomBits >> 30 );\
Temp>>= 1;\
TopBits<<= 2;\
BottomBits<<= 2;\
OutBits|= ( Temp << shift );
void V_Draw( void ) {
u32* Top = ( u32* ) macfb_bin;
u32* Bottom = ( u32* ) ( &macfb_bin[ 64 ] );
u32* Gfx = ( u32* ) BG_BMP_RAM( 0 );
u32 TopBits = 0;
u32 BottomBits = 0;
u32 Temp = 0;
u32 OutBits = 0;
int i;
int S;
int E;
S = HBlankCount;
for ( i = 0; i < macfb_bin_size; i+= 4 ) {
TopBits = *Top++;
BottomBits = *Bottom++;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
ExpandScaleDown( 0 );
ExpandScaleDown( 8 );
ExpandScaleDown( 16 );
ExpandScaleDown( 24 );
*Gfx++ = OutBits;
}
E = ( HBlankCount - S );
iprintf( "Scale took %d lines.\n", E );
}
|
#167236 - Miked0801 - Fri Mar 06, 2009 6:40 pm
A few questions:
Is macfb_bin 4-byte aligned when you receive it? If not, this will not work.
Also, you are not coding in Fortran77. Variable names S and E would earn a wrist slap from me where I work.
Also, that macro is just asking for bugs. Use inline code (after you debug it) instead and get type checking and ease of debugging.
Anyways, I rethought that problem and using a lookup table will probably win in performance anyways along with being easier to write and maintain.
Use this to generate a 256 entry lookup table
Code: |
// Prints the 256 entries of the 16x16 averaging table, 16 per line.
// Both loop variables are 4-bit values; each entry packs the halved sum of
// their low 2-bit groups in bits 0-1 and of their high 2-bit groups in bits 2-3.
for(int top=0;top<16;top++)
{
    for(int bottom=0;bottom<16;bottom++)
    {
        int lowPair = ((top & 0x03) + (bottom & 0x03)) / 2;
        int highPair = ((top >> 2) + (bottom >> 2)) / 2;
        printf("0x%x, ", (lowPair | (highPair << 2)));
    }
    printf("\n");
}
|
and try something like this:
Code: |
static const u8 kAverageLookup[256] = {//data from output};
u8 *Top = whatever;
u8 *Bottom = whatever_offseted;
for (int i=0; i<macfb_bin_size; i++)
{
// Grab the next 8 bits
u8 topBits = *Top++
u8 bottomBits = *Bottom++
// Get the low nibble of each stream. Place the bottom bits
// Recombine them with bottomBits in the high nibble of a lookup
u8 lowIndex = topBits & 0x0F;
u8 highIndex = bottomBits << 4;
int outBits = kAverageLookup[lowIndex | highIndex];
// Get the high nibble of each stream. Place the bottom bits
// Recombine them with bottomBits in the high nibble of a lookup
u8 lowIndex = topBits >> 4;
u8 highIndex = bottomBits & 0xF0;
int outBits |= kAverageLookup[lowIndex | highIndex];
// Output
*Gfx++ = OutBits;
}
|
If you can afford a 64k lookup, then you can take it a step further and go FAST:
Use this to generate a 64k entry lookup table (not completely tested)
Code: |
// Prints the 65536 entries of the 256x256 averaging table, 256 per line.
// i and j are the two source bytes; each of the four 2-bit groups is summed
// with its counterpart and halved, then repacked at the same bit position.
for(int i=0;i<256;i++)
{
    for(int j=0;j<256;j++)
    {
        int nibble1 = ((i & 0x03) + (j & 0x03)) >> 1;
        int nibble2 = (((i & 0x0C) >> 2) + ((j & 0x0C) >> 2)) >> 1;
        // j must be masked with 0x30 here: (j & 0x0C) >> 4 is always 0,
        // which dropped the second byte from this group.
        int nibble3 = (((i & 0x30) >> 4) + ((j & 0x30) >> 4)) >> 1;
        int nibble4 = ((i >> 6) + (j >> 6)) >> 1;
        int value = nibble1 | (nibble2 << 2) | (nibble3 << 4) | (nibble4 << 6);
        printf("0x%x, ", value);
    }
    printf("\n");
}
|
Code: |
// Sketch: one table lookup per pair of source bytes.
for (int i=0; i<macfb_bin_size; i++)
{
    // Grab the next 8 bits
    u8 topBits = *Top++;
    u8 bottomBits = *Bottom++;
    // Output: topBits selects the low byte of the index, bottomBits the high byte
    *Gfx++ = kAverageLookup[topBits | (bottomBits << 8)];
}
|
And that will be awesome fast. If that's not fast enough, then read in 4 bytes at a time and unroll the loop 4 times for another decent speed gain. Still, roughly 18 cycles per byte (not counting memory waitstates) as is will be pretty dang fast. The unroll with 4 byte read/write takes it down to around 13.5 cycles.
Hell even the "slow" 256 entry table will run at around 26 cycles per byte.
But back to your code, your passed in shift values are way wrong. They should be 0, 2, 4, 6, ... 30 and then output to gfx as a u32.
With care, it will run at around 29 cycles or so per byte from the metrics I was using on my examples. But it will not cache as efficiently as the table code and is much harder to read. Use a lookup.
#167361 - Lazy1 - Mon Mar 09, 2009 11:27 pm
Thank you very much for the help, a full screen scale now only takes 150 lines on the arm9.
Quote: |
Also, you are not coding in Fortran77. Variable names S and E would earn a wrist slap from me where I work.
|
Yeah, usually I try to make good variable names but if I'm just screwing around with something I'll let a few bad ones slip by.
#167362 - Miked0801 - Mon Mar 09, 2009 11:40 pm
Using the 256 byte table or the 64k table version?
#167363 - Lazy1 - Mon Mar 09, 2009 11:53 pm
64K table version.
Edit:
This is embarrassing, now I have a bug to find. Seems each line is being doubled.
I didn't notice at first since mistakenly the bg was set to 512x512.
#167366 - Miked0801 - Tue Mar 10, 2009 4:44 am
In my 64k generator:
int nibble3 = (((i & 0x30) >> 4) + ((j & 0x0C) >> 4)) >> 1;
should be:
int nibble3 = (((i & 0x30) >> 4) + ((j & 0x30) >> 4)) >> 1;
#167370 - Lazy1 - Tue Mar 10, 2009 8:07 am
I figured it out somehow and now it's down to only 69 lines on hardware to scale down the screen.
I did not use your code directly but you pointed me in the right path with the lookup table.
Thanks :D
#167381 - Miked0801 - Tue Mar 10, 2009 5:17 pm
Sweet. Can you publish your final code here so we can see? It's always fun to see how far a piece of code can go and going from 300+ lines to 69 lines is pretty sweet.
#167389 - Lazy1 - Tue Mar 10, 2009 7:17 pm
It's a little embarrassing since looking now my lookup table is actually 256Kb and not 64, I guess since I took the lookup table idea and sort of hacked into what I was doing before.
I really need to get more sleep :/
Code: |
/* One entry per (upper byte, lower byte) pair: four output pixels per entry. */
u32 ScaledValueTable[ 256 * 256 ];

/*
 * Fills ScaledValueTable.
 *
 * Entry [ a + b * 256 ] combines source bytes a and b.  For each of the
 * four 2-bit groups, the group values of a and b are added and halved; the
 * result for the top group (bits 7-6) is stored in the lowest byte of the
 * entry and the result for the bottom group (bits 1-0) in the highest byte.
 */
void V_SetupScaler( void ) {
    int TopByte;
    int BottomByte;
    int Group;

    for ( TopByte = 0; TopByte < 256; TopByte++ ) {
        for ( BottomByte = 0; BottomByte < 256; BottomByte++ ) {
            u32 Packed = 0;

            for ( Group = 0; Group < 4; Group++ ) {
                /* Group 0 is bits 7-6, group 3 is bits 1-0. */
                int SrcShift = 6 - ( 2 * Group );
                u32 Sum = ( ( TopByte >> SrcShift ) & 0x03 );

                Sum+= ( ( BottomByte >> SrcShift ) & 0x03 );
                Packed |= ( Sum >> 1 ) << ( 8 * Group );
            }

            ScaledValueTable[ TopByte + ( BottomByte * 256 ) ] = Packed;
        }
    }
}
/*
 * Half-size scaler driven by ScaledValueTable: for each of 170 row pairs it
 * reads 64 bytes from the upper row and 64 from the row below, and writes
 * one table entry (four output pixels) per byte pair to BG_BMP_RAM( 0 ).
 * Prints the hblank count spent in the conversion loop.
 *
 * The table is filled on the first call only; its contents never change,
 * so rebuilding all 65536 entries on every draw was wasted work.
 */
void V_Draw( void ) {
    static int TableReady = 0;
    u8* Top = ( u8* ) macfb_bin;
    u8* Bottom = Top + 64;
    u32* Gfx = ( u32* ) BG_BMP_RAM( 0 );
    int HBlankStart = 0;
    int HBlankEnd = 0;
    int x;
    int y;

    if ( ! TableReady ) {
        V_SetupScaler( );
        TableReady = 1;
    }

    HBlankStart = HBlankCount;

    for ( y = 0; y < 170; y++ ) {
        /* 256 output pixels per row, four per table entry. */
        for ( x = 0; x < 256; x+= 4 ) {
            *Gfx++ = ScaledValueTable[ *Top++ + ( *Bottom++ * 256 ) ];
        }

        /* Skip the row the other pointer has just consumed. */
        Top+= 64;
        Bottom+= 64;
    }

    HBlankEnd = HBlankCount;
    iprintf( "Scale took %d lines.\n", ( HBlankEnd - HBlankStart ) );
}
|
I'm so tired I really don't remember how that came to be but criticism is always welcome.
On the arm7 it gets around 170 lines or so but takes up both vram banks C and D to hold the table.