gbadev.org forum archive

As discussed previously, YUV 4:2:0 -> RGB colour conversion is fairly slow. I previously wrote an assembly function to do it as fast as possible but it's buried in the source of tuna-viDS. It takes 3 input pointers, one each to the Y, U and V components, and writes out pixels in RGB15 format (with alpha bit NOT set). I use it with a pointer directly to 3 RAM banks displayed in frame buffer mode, for triple buffering purposes. You'll need to manually set the alpha bit if you want to use any other video mode, probably quickest to orr each pixel with 0x8000 in the ASM function. It uses ARM9 specific instructions so it won't assemble for the ARM7.

Has anyone achieved something faster?

Here it is in its entirety, for those who want it (without the GPL infection from tuna-viDS):

Code:

# Copyright (c) 2008, Michael "Chishm" Chisholm
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the <organization> nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY <copyright holder> ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@ Build-time constants for yv12_to_rgb555_asm.
@
@ SCALEBITS_OUT is the number of fractional bits in the fixed-point colour
@ factors below (each factor is coefficient * (1 << SCALEBITS_OUT), rounded).
.equ SCALEBITS_OUT, 6
@ SCALEBITS_DISP is not referenced by name in the routine, which hard-codes
@ its shifts; presumably it documents the "lsr #9" that turns a scaled
@ component into a 5-bit channel -- TODO confirm.
.equ SCALEBITS_DISP, 9
@ Largest value of one output colour channel (5 bits).
.equ MAX_RGB,    31
@ Picture geometry: WIDTH is in pixels, the strides are byte distances between
@ consecutive rows of the output (X), luma (Y) and chroma (UV) buffers.
.equ WIDTH,       256
.equ X_STRIDE,    512
.equ Y_STRIDE,    384
.equ UV_STRIDE,    192

@ Fixed-point colour factors. U_FACTORS and V_FACTORS pack two 16-bit factors
@ per word so that one register feeds the halfword multiplies: the green
@ factor sits in the top half, the blue (U) or red (V) factor in the bottom.
.equ B_U_FACTOR,    129    @ = ((2.018) * (1<<SCALEBITS_OUT) + 0.5)
.equ G_U_FACTOR,    25    @ = ((0.391) * (1<<SCALEBITS_OUT) + 0.5)
.equ G_V_FACTOR,    52    @ = ((0.813) * (1<<SCALEBITS_OUT) + 0.5)
.equ R_V_FACTOR,    102    @ = ((1.596) * (1<<SCALEBITS_OUT) + 0.5)
.equ U_FACTORS,    (G_U_FACTOR << 16) | (B_U_FACTOR)
.equ V_FACTORS,    (G_V_FACTOR << 16) | (R_V_FACTOR)
.equ RGB_Y_FACTOR, 74    @ = ((1.164) * (1<<SCALEBITS_OUT) + 0.5)

.align
.global yv12_to_rgb555_asm
.func yv12_to_rgb555_asm

@ void yv12_to_rgb555_asm (uint8_t * out_ptr, uint8_t * y_ptr,
@                          uint8_t * u_ptr, uint8_t * v_ptr, int height)
@
@ Converts planar YUV 4:2:0 to 15-bit RGB, one 2x2 pixel block per inner-loop
@ iteration (one U byte and one V byte shared by four Y bytes).
@
@ In:   r0      = out_ptr, destination; two 16-bit pixels are stored per word
@                 with red in bits 0-4, green in bits 5-9, blue in bits 10-14.
@                 Must be X_STRIDE (512 byte) aligned: the end of a row is
@                 detected by the low address bits of r0 wrapping to zero.
@       r1      = y_ptr, luma plane, read two bytes at a time with ldrh, so it
@                 should be 2-byte aligned
@       r2      = u_ptr, chroma U plane
@       r3      = v_ptr, chroma V plane
@       [sp]    = height in rows; rows are converted in pairs, nothing is
@                 written when height <= 0
@ Out:  none
@ Uses: r4-r11 and lr are saved and restored; r12 and the flags are clobbered.
@       Needs the ARMv5TE halfword multiplies (smulbb / smultb / smlatb).
@
@ Register roles inside the loop:
@       r4 = b_u0   B_U_FACTOR * (U - 128)
@       r5 = g_uv0  G_U_FACTOR * (U - 128) + G_V_FACTOR * (V - 128)
@       r6 = r_v0   R_V_FACTOR * (V - 128)
@       r7 = rgb_y  RGB_Y_FACTOR * (Y - 16) for the current pixel
@       r8, r9      scratch for the clamp
@       r10         low 16 bits = RGB_Y_FACTOR, high 16 bits = rows left - 1
@       r11, r12    loaded samples, then the pixel pair being assembled
@
@ Clamp idiom used for every channel (value is scaled by 1 << SCALEBITS_OUT):
@       movs  r9, value, asr #14    Z: value already fits in 5 bits after >> 9
@                                   N: value is negative -> channel stays 0
@       ..eq  insert value >> 9
@       ..gt  otherwise positive overflow -> insert MAX_RGB
@ GT also tests the V flag, which movs does not write, so V must be clear
@ whenever the loop runs (see the subs on the height below).
@
@ NOTE(review): bit 15 is set for the left pixel of each pair only; bit 31 of
@ the stored word is never set, so the top bit differs between neighbouring
@ pixels. Confirm the intended alpha handling before using the output in a
@ video mode that interprets that bit.

yv12_to_rgb555_asm:

    stmfd   sp!, {r4-r11, lr}

    @ cache colour factors on the stack for quick reloading; r7 and r8 are
    @ reused as scratch inside the loop
    ldr     r7, =U_FACTORS
    ldr     r8, =V_FACTORS
    stmfd   sp!, {r7-r8}

    @ combine Y factor and line count into one variable -- each needs only 16 bits
    mov     r10, #RGB_Y_FACTOR          @ low 16 bits are Y factor
    ldr     r12, [sp, #44]              @ height: above 9 saved regs + 2 cached words
    subs    r12, r12, #1                @ rows - 1; also leaves V clear for the GT clamps
    blt     9f                          @ height <= 0: nothing to convert
    orr     r10, r10, r12, lsl #16      @ high 16 bits are line count

b_u0 .req r4
g_uv0 .req r5
r_v0 .req r6
rgb_y .req r7

column_loop:

row_loop:

    ldmfd   sp, {r7-r8}                 @ r7 = U_FACTORS, r8 = V_FACTORS
    ldrb    r11, [r2], #1               @ load U value
    ldrb    r12, [r3], #1               @ load V value
    @ Calculate colour differences. Each chroma sample is multiplied by the
    @ factors from its own packed word: U with U_FACTORS, V with V_FACTORS.
    sub     r11, r11, #128
    sub     r12, r12, #128
    smultb  g_uv0, r7, r11              @ g_uv0  = G_U_FACTOR * U
    smulbb  b_u0, r7, r11               @ b_u0   = B_U_FACTOR * U
    smulbb  r_v0, r8, r12               @ r_v0   = R_V_FACTOR * V
    smlatb  g_uv0, r8, r12, g_uv0       @ g_uv0 += G_V_FACTOR * V

    @ top row
    ldrh    r11, [r1], #2               @ load Y value for 2 pixels

    @ top left pixel luma
    and     r12, r11, #0xFF
    sub     r12, r12, #16
    smulbb  rgb_y, r10, r12

    @ red
    mov     r12, #(1<<15)
    add     r8, rgb_y, r_v0
    movs    r9, r8, asr #14
    orreq   r12, r12, r8, lsr #9
    orrgt   r12, r12, #MAX_RGB

    @ green
    sub     r8, rgb_y, g_uv0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #5
    orrgt   r12, r12, #(MAX_RGB<<5)

    @ blue
    add     r8, rgb_y, b_u0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #10
    orrgt   r12, r12, #(MAX_RGB<<10)

    @ top right pixel luma
    mov     r11, r11, lsr #8
    sub     r11, r11, #16
    smulbb  rgb_y, r10, r11

    @ red
    add     r8, rgb_y, r_v0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #16
    orrgt   r12, r12, #(MAX_RGB<<16)

    @ green
    sub     r8, rgb_y, g_uv0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #21
    orrgt   r12, r12, #(MAX_RGB<<21)

    @ blue
    add     r8, rgb_y, b_u0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #26
    orrgt   r12, r12, #(MAX_RGB<<26)

    @ store 2 pixels
    str     r12, [r0], #4

    @ bottom row: r1 has already moved 2 bytes on, hence the -2
    add     r11, r1, #Y_STRIDE
    ldrh    r11, [r11, #-2]             @ load Y value for 2 pixels

    @ bottom left pixel luma
    and     r12, r11, #0xFF
    sub     r12, r12, #16
    smulbb  rgb_y, r10, r12

    @ red
    mov     r12, #(1<<15)
    add     r8, rgb_y, r_v0
    movs    r9, r8, asr #14
    orreq   r12, r12, r8, lsr #9
    orrgt   r12, r12, #MAX_RGB

    @ green
    sub     r8, rgb_y, g_uv0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #5
    orrgt   r12, r12, #(MAX_RGB<<5)

    @ blue
    add     r8, rgb_y, b_u0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #10
    orrgt   r12, r12, #(MAX_RGB<<10)

    @ bottom right pixel luma
    mov     r11, r11, lsr #8
    sub     r11, r11, #16
    smulbb  rgb_y, r10, r11

    @ red
    add     r8, rgb_y, r_v0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #16
    orrgt   r12, r12, #(MAX_RGB<<16)

    @ green
    sub     r8, rgb_y, g_uv0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #21
    orrgt   r12, r12, #(MAX_RGB<<21)

    @ blue
    add     r8, rgb_y, b_u0
    movs    r9, r8, asr #14
    moveq   r8, r8, lsr #9
    orreq   r12, r12, r8, lsl #26
    orrgt   r12, r12, #(MAX_RGB<<26)

    @ store 2 pixels; r0 has already moved 4 bytes on, hence the -4
    str     r12, [r0, #(X_STRIDE-4)]

    @ a row is finished when the low address bits of r0 wrap back to zero
    tst     r0, #0x1FC
    bne     row_loop

    @ two rows done: count them, then step every pointer over the second
    @ row and the unused tail of each stride. The adds leave the flags of
    @ the subs intact for the bgt.
    subs    r10, r10, #(2<<16)
    add     r0, r0, #(2*X_STRIDE - 2*WIDTH)
    add     r1, r1, #(2*Y_STRIDE - WIDTH)
    add     r2, r2, #(UV_STRIDE - (WIDTH / 2))
    add     r3, r3, #(UV_STRIDE - (WIDTH / 2))
    bgt     column_loop

9:
    add     sp, sp, #8                  @ drop the cached colour factors
    ldmfd   sp!, {r4-r11, pc}

.endfunc

.pool

EDIT: Corrected advice on orr with 0x8000.
_________________
http://chishm.drunkencoders.com
http://dldi.drunkencoders.com

Last edited by chishm on Fri May 16, 2008 1:17 am; edited 1 time in total

Awesome :D
Now I just have to decide if it's better to stream MPEG-1 or MPEG-4.

chishm wrote:

You'll need to manually set the alpha bit if you want to use any other video mode, probably quickest to orr each pixel with 0x1000

0x8000?
_________________
I'm a PSP hacker now, but I still <3 DS.

HyperHacker wrote:

chishm wrote:

You'll need to manually set the alpha bit if you want to use any other video mode, probably quickest to orr each pixel with 0x1000

0x8000?

Ah yes, silly me. Of course I meant the highest bit (0x8000, or more easily, 1<<15).
_________________
http://chishm.drunkencoders.com
http://dldi.drunkencoders.com

chishm wrote:

Has anyone achieved something faster?

Here is my attempt. This should work on either the ARM7 or the ARM9. It's released under the GPL. If anyone wants it in under a different license, contact me.

Code:

@ YUV-> RGB conversion code Copyright (C) 2008 Robin Watts (robin@wss.co.uk).
@
@ Licensed under the GPL. If you need it under another license, contact me
@ and ask.
@
@ This program is free software ; you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation ; either version 2 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY ; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program ; if not, write to the Free Software
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@
@
@ The algorithm used here is based heavily on one created by Sophie Wilson
@ of Acorn/e-14/Broadcom. Many thanks.
@
@ Modifications are made here to allow for more bits of overflow (required
@ by the range of the B values due to the larger than expected B scalar).
@ Additional tweaks (in the fast fixup code) are from Paul Gardiner.
@
@ The old implementation of YUV -> RGB did:
@
@ R = CLAMP((Y-16)*1.164 + 1.596*V)
@ G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
@ B = CLAMP((Y-16)*1.164 + 2.018*U )
@
@ We're going to use tables to do it faster, but rather than doing it using
@ 5 tables as the above suggests, we're going to do it using just 3.
@
@ We do this by working in parallel within a 32 bit word, and using one
@ table each for Y U and V.
@
@ Source Y values are 0 to 255, so -19 to 278 after scaling
@ Source U values are -128 to 127, so -50.. 50(G), -258..256(B) after scaling
@ Source V values are -128 to 127, so -204..203(R), -104..103(G) after scaling
@
@ So total summed values:
@ -223 <= R <= 481, -173 <= G <= 431, -277 <= B <= 534
@
@ We need to pack R G and B into a 32 bit word, and because of Bs range we
@ need 2 bits above the valid range of B to detect overflow, and another one
@ to detect the sense of the overflow. We therefore adopt the following
@ representation:
@
@ ossGGGGGggggossBBBBBbbossRRRRRrr
@
@ Each such word breaks down into 3 ranges.
@
@ ossGGGGGgggg ossBBBBBbb ossRRRRRrr
@
@ Thus we have 7 bits for each B and R table entry, and 9 bits for G (good
@ as G is the most noticeable one). The 2 s bits for each represent the sign,
@ and o represents the overflow.
@
@ For R and B we pack the table by taking the 11 bit representation of their
@ values, shifting it right by 1. We toggle bit 9 in the U and V tables.
@
@ For the green case we calculate 2*G (thus effectively using 9 bits for the
@ valid range) truncate to 12 bits. We toggle bit 11 in the Y table.

@ Picture geometry used by yv12_to_rgb555_asm. The strides are byte distances
@ between consecutive rows of the chroma (UV), luma (Y) and output (X)
@ buffers; WIDTH is the number of pixels converted per row.
.equ UV_STRIDE, 192
.equ Y_STRIDE, 384
.equ X_STRIDE, 512
.equ WIDTH,    256

.align
.global yv12_to_rgb555_asm
.func yv12_to_rgb555_asm

@ void yv12_to_rgb555_asm
@ uint8_t * x_ptr
@ uint8_t * y_ptr
@ uint8_t * u_ptr
@ uint8_t * v_ptr
@ int height
@
@ Table-driven YUV 4:2:0 -> 15-bit RGB conversion. Each table entry holds
@ three packed colour fields; adding the Y, U and V entries sums all three
@ channels at once, and the result is then squeezed into a 16-bit pixel
@ (red in bits 0-4, green in bits 5-9, blue in bits 10-14).
@
@ One pass of xloop converts a 2x2 pixel block: one U and one V sample are
@ shared by two luma samples on the current row and two on the row below.
@
@ height is read with LDRB, so only its low byte is used (height < 256),
@ and rows are always converted in pairs.
@
@ Register roles:
@ r4  = 0x07C07C1F, mask selecting the three 5-bit channels after the shift
@ r5  = 0x60060180, mask of the two "s" bits of each field; any of them set
@       means a field left its valid range and needs the fixup path
@ r8  = y_table, r9 = u_table, r10 = v_table
@ r11 = summed U+V table entries for the current 2x2 block
@ r14 = packed loop counter: rows remaining in the low bits, a negative
@       column count (scaled by 256) above them

yv12_to_rgb555_asm:

@ r0 = x_ptr
@ r1 = y_ptr
@ r2 = u_ptr
@ r3 = v_ptr
@ <> = height
STMFD r13!,{r4-r11,r14}

ADR r8, y_table
LDRB r14,[r13,#9*4]    @ r14 = height (5th argument, above the 9 saved registers)
ADD r9, r8,#1024 @ u_table: relies on y_table being exactly 256 words
ADD r10,r9,#1024 @ v_table: relies on u_table being exactly 256 words
LDR r4, =0x07C07C1F
LDR r5, =0x60060180
yloop:
SUB r14,r14,#WIDTH<<8 @ r14 = height-(WIDTH<<8)
xloop:
LDRB r11,[r2],#1    @ r11 = u = *u_ptr++
LDRB r12,[r3],#1    @ r12 = v = *v_ptr++
LDRB r7, [r1, #Y_STRIDE] @ r7 = y2 = y_ptr[stride]
LDRB r6, [r1],#1    @ r6 = y0 = *y_ptr++
LDR r11,[r9, r11,LSL #2] @ r11 = u = u_table[u]
LDR r12,[r10,r12,LSL #2] @ r12 = v = v_table[v]
LDR r7, [r8, r7, LSL #2] @ r7 = y2 = y_table[y2]
LDR r6, [r8, r6, LSL #2] @ r6 = y0 = y_table[y0]
ADD r11,r11,r12    @ r11 = uv = u+v

ADD r7, r7,r11    @ r7 = y2 + uv (pixel on the row below)
ADD r6, r6,r11    @ r6 = y0 + uv (pixel on the current row)
ANDS r12, r7, r5    @ r12 = out-of-range bits of r7 (used by fix01)
TSTEQ r6, r5    @ only if r7 was clean: check r6 as well
BNE fix01
return01:
AND r7, r4, r7, LSR #2    @ keep the top 5 bits of each field
AND r6, r4, r6, LSR #2
ORR r7, r7, r7, LSR #17    @ bring green down from bits 22-26 to bits 5-9
ORR r6, r6, r6, LSR #17
ADD r12,r0, #X_STRIDE
STRH r7, [r12]    @ lower-row pixel
LDRB r12,[r1, #Y_STRIDE] @ r12 = y3 = y_ptr[stride]
LDRB r7, [r1],#1    @ r7 = y1 = *y_ptr++
STRH r6, [r0],#2    @ current-row pixel, x_ptr++

LDR r12,[r8, r12,LSL #2] @ r12 = y3 = y_table[y3]
LDR r6, [r8, r7, LSL #2] @ r6 = y1 = y_table[y1]

ADD r7, r12,r11    @ r7 = y3 + uv
ADD r6, r6, r11    @ r6 = y1 + uv
ANDS r12, r7, r5
TSTEQ r6, r5
BNE fix02
return02:
AND r7, r4, r7, LSR #2
AND r6, r4, r6, LSR #2
ORR r7, r7, r7, LSR #17
ORR r6, r6, r6, LSR #17
ADD r12,r0, #X_STRIDE
STRH r7, [r12]
STRH r6, [r0],#2
ADDS r14,r14,#2<<8    @ two more columns done; goes non-negative after WIDTH columns
BLT xloop

ADD r0, r0, #2*X_STRIDE - 2*WIDTH @ x_ptr to next line
ADD r1, r1, #2*Y_STRIDE - WIDTH @ y_ptr to next line
ADD r2, r2, #UV_STRIDE - (WIDTH/2)
ADD r3, r3, #UV_STRIDE - (WIDTH/2)

SUBS r14,r14,#2    @ two rows done
BGT yloop

LDMFD r13!,{r4-r11, pc}
@ Fixup for the first pixel pair. A field whose s bits are set is first
@ forced to all ones; if its o bit is clear (the sum went negative) the
@ final ADD then carries it round to zero. Fields that are in range have
@ their o bit set and no s bits, so they pass through unchanged.
@ r14 is borrowed as scratch here, hence the save/restore around the body.
fix01:
@ r7 and r6 are the values, at least one of which has overflowed
@ r12 = r7 & mask = .ss.....ss.....ss.....
STMFD r13!,{r14}
AND r14,r5, r5, LSR #1 @ r14 = ..1......1......1.....
ORR r12,r12,r12,LSR #1 @ r12 = .sSx....sSx....sSx....
AND r12,r12,r14    @ r12 = ..S......S......S.....
SUB r12,r12,r12,LSR #7 @ r12 = ...SSSSS..SSSSS..SSSSS
ORR r7, r7, r12    @ r7 |= ...SSSSS..SSSSS..SSSSS
BIC r12,r14,r7, LSR #2 @ r12 = ..o......o......o.....
ADD r7, r7, r12,LSR #7 @ r7 = fixed value

AND r12, r6, r5    @ r12 = .ss.....ss.....ss.....
ORR r12,r12,r12,LSR #1 @ r12 = .sSx....sSx....sSx....
AND r12,r12,r14    @ r12 = ..S......S......S.....
SUB r12,r12,r12,LSR #7 @ r12 = ...SSSSS..SSSSS..SSSSS
ORR r6, r6, r12    @ r6 |= ...SSSSS..SSSSS..SSSSS
BIC r12,r14,r6, LSR #2 @ r12 = ..o......o......o.....
ADD r6, r6, r12,LSR #7 @ r6 = fixed value
LDMFD r13!,{r14}
B return01
@ Same fixup for the second pixel pair. r11 (the U+V sum) is no longer
@ needed for this block, so it is used as the scratch register instead of
@ spilling r14.
fix02:
@ r7 and r6 are the values, at least one of which has overflowed
@ r12 = r7 & mask = .ss.....ss.....ss.....
AND r11,r5, r5, LSR #1 @ r11 = ..1......1......1.....
ORR r12,r12,r12,LSR #1 @ r12 = .sSx....sSx....sSx....
AND r12,r12,r11    @ r12 = ..S......S......S.....
SUB r12,r12,r12,LSR #7 @ r12 = ...SSSSS..SSSSS..SSSSS
ORR r7, r7, r12    @ r7 |= ...SSSSS..SSSSS..SSSSS
BIC r12,r11,r7, LSR #2 @ r12 = ..o......o......o.....
ADD r7, r7, r12,LSR #7 @ r7 = fixed value

AND r12, r6, r5    @ r12 = .ss.....ss.....ss.....
ORR r12,r12,r12,LSR #1 @ r12 = .sSx....sSx....sSx....
AND r12,r12,r11    @ r12 = ..S......S......S.....
SUB r12,r12,r12,LSR #7 @ r12 = ...SSSSS..SSSSS..SSSSS
ORR r6, r6, r12    @ r6 |= ...SSSSS..SSSSS..SSSSS
BIC r12,r11,r6, LSR #2 @ r12 = ..o......o......o.....
ADD r6, r6, r12,LSR #7 @ r6 = fixed value
B return02
.endfunc
.pool

@ Luma lookup table: 256 words, indexed by the Y byte. Each word carries the
@ luma contribution to all three packed colour fields; bit 31 (the o bit of
@ the green field) is set in the in-range entries. The routine reaches
@ u_table as y_table + 1024, so this table must stay exactly 256 words long
@ and be followed directly by u_table.
y_table:
.word 0x7FFFFFF7
.word 0x7FFFFFF7
.word 0x7FFFFFF8
.word 0x7FFFFFF8
.word 0x7FFFFFF9
.word 0x7FFFFFFA
.word 0x7FFFFFFA
.word 0x7FFFFFFB
.word 0x7FFFFFFB
.word 0x7FFFFFFC
.word 0x7FFFFFFD
.word 0x7FFFFFFD
.word 0x7FFFFFFE
.word 0x7FFFFFFE
.word 0x7FFFFFFF
.word 0x7FFFFFFF
.word 0x80000000
.word 0x80200401
.word 0x80500401
.word 0x80700802
.word 0x80900802
.word 0x80C00C03
.word 0x80E00C03
.word 0x81001004
.word 0x81301405
.word 0x81501405
.word 0x81701806
.word 0x81A01806
.word 0x81C01C07
.word 0x81E02008
.word 0x82102008
.word 0x82302409
.word 0x82502409
.word 0x8280280A
.word 0x82A0280A
.word 0x82C02C0B
.word 0x82F0300C
.word 0x8310300C
.word 0x8330340D
.word 0x8360340D
.word 0x8380380E
.word 0x83A03C0F
.word 0x83D03C0F
.word 0x83F04010
.word 0x84104010
.word 0x84404411
.word 0x84604411
.word 0x84804812
.word 0x84A04C13
.word 0x84D04C13
.word 0x84F05014
.word 0x85105014
.word 0x85405415
.word 0x85605816
.word 0x85805816
.word 0x85B05C17
.word 0x85D05C17
.word 0x85F06018
.word 0x86206018
.word 0x86406419
.word 0x8660681A
.word 0x8690681A
.word 0x86B06C1B
.word 0x86D06C1B
.word 0x8700701C
.word 0x8720741D
.word 0x8740741D
.word 0x8770781E
.word 0x8790781E
.word 0x87B07C1F
.word 0x87E07C1F
.word 0x88008020
.word 0x88208421
.word 0x88508421
.word 0x88708822
.word 0x88908822
.word 0x88C08C23
.word 0x88E09024
.word 0x89009024
.word 0x89309425
.word 0x89509425
.word 0x89709826
.word 0x89A09826
.word 0x89C09C27
.word 0x89E0A028
.word 0x8A10A028
.word 0x8A30A429
.word 0x8A50A429
.word 0x8A80A82A
.word 0x8AA0A82A
.word 0x8AC0AC2B
.word 0x8AF0B02C
.word 0x8B10B02C
.word 0x8B30B42D
.word 0x8B60B42D
.word 0x8B80B82E
.word 0x8BA0BC2F
.word 0x8BD0BC2F
.word 0x8BF0C030
.word 0x8C10C030
.word 0x8C40C431
.word 0x8C60C431
.word 0x8C80C832
.word 0x8CB0CC33
.word 0x8CD0CC33
.word 0x8CF0D034
.word 0x8D20D034
.word 0x8D40D435
.word 0x8D60D836
.word 0x8D90D836
.word 0x8DB0DC37
.word 0x8DD0DC37
.word 0x8DF0E038
.word 0x8E20E038
.word 0x8E40E439
.word 0x8E60E83A
.word 0x8E90E83A
.word 0x8EB0EC3B
.word 0x8ED0EC3B
.word 0x8F00F03C
.word 0x8F20F43D
.word 0x8F40F43D
.word 0x8F70F83E
.word 0x8F90F83E
.word 0x8FB0FC3F
.word 0x8FE0FC3F
.word 0x90010040
.word 0x90210441
.word 0x90510441
.word 0x90710842
.word 0x90910842
.word 0x90C10C43
.word 0x90E11044
.word 0x91011044
.word 0x91311445
.word 0x91511445
.word 0x91711846
.word 0x91A11846
.word 0x91C11C47
.word 0x91E12048
.word 0x92112048
.word 0x92312449
.word 0x92512449
.word 0x9281284A
.word 0x92A1284A
.word 0x92C12C4B
.word 0x92F1304C
.word 0x9311304C
.word 0x9331344D
.word 0x9361344D
.word 0x9381384E
.word 0x93A13C4F
.word 0x93D13C4F
.word 0x93F14050
.word 0x94114050
.word 0x94414451
.word 0x94614451
.word 0x94814852
.word 0x94B14C53
.word 0x94D14C53
.word 0x94F15054
.word 0x95215054
.word 0x95415455
.word 0x95615856
.word 0x95915856
.word 0x95B15C57
.word 0x95D15C57
.word 0x96016058
.word 0x96216058
.word 0x96416459
.word 0x9671685A
.word 0x9691685A
.word 0x96B16C5B
.word 0x96D16C5B
.word 0x9701705C
.word 0x9721745D
.word 0x9741745D
.word 0x9771785E
.word 0x9791785E
.word 0x97B17C5F
.word 0x97E17C5F
.word 0x98018060
.word 0x98218461
.word 0x98518461
.word 0x98718862
.word 0x98918862
.word 0x98C18C63
.word 0x98E19064
.word 0x99019064
.word 0x99319465
.word 0x99519465
.word 0x99719866
.word 0x99A19866
.word 0x99C19C67
.word 0x99E1A068
.word 0x9A11A068
.word 0x9A31A469
.word 0x9A51A469
.word 0x9A81A86A
.word 0x9AA1AC6B
.word 0x9AC1AC6B
.word 0x9AF1B06C
.word 0x9B11B06C
.word 0x9B31B46D
.word 0x9B61B46D
.word 0x9B81B86E
.word 0x9BA1BC6F
.word 0x9BD1BC6F
.word 0x9BF1C070
.word 0x9C11C070
.word 0x9C41C471
.word 0x9C61C471
.word 0x9C81C872
.word 0x9CB1CC73
.word 0x9CD1CC73
.word 0x9CF1D074
.word 0x9D21D074
.word 0x9D41D475
.word 0x9D61D876
.word 0x9D91D876
.word 0x9DB1DC77
.word 0x9DD1DC77
.word 0x9E01E078
.word 0x9E21E078
.word 0x9E41E479
.word 0x9E71E87A
.word 0x9E91E87A
.word 0x9EB1EC7B
.word 0x9EE1EC7B
.word 0x9F01F07C
.word 0x9F21F47D
.word 0x9F51F47D
.word 0x9F71F87E
.word 0x9F91F87E
.word 0x9FC1FC7F
.word 0x9FE1FC7F
.word 0xA0020080
.word 0xA0220481
.word 0xA0520481
.word 0xA0720882
.word 0xA0920882
.word 0xA0C20C83
.word 0xA0E21084
.word 0xA1021084
.word 0xA1321485
.word 0xA1521485
.word 0xA1721886
.word 0xA1A21886
.word 0xA1C21C87
.word 0xA1E22088
.word 0xA2122088
.word 0xA2322489
.word 0xA2522489
.word 0xA282288A
.word 0xA2A22C8B
.word 0xA2C22C8B
@ Chroma U lookup table: 256 words, indexed by the U byte. Carries the U
@ contribution to the green and blue fields. The routine reaches it as
@ y_table + 1024 and reaches v_table as u_table + 1024, so it must stay
@ exactly 256 words long and sit between those two tables.
u_table:
.word 0x0645FC00
.word 0x06360000
.word 0x06360400
.word 0x06260800
.word 0x06160C00
.word 0x06061000
.word 0x05F61400
.word 0x05F61800
.word 0x05E61C00
.word 0x05D62000
.word 0x05C62400
.word 0x05B62800
.word 0x05B62C00
.word 0x05A63000
.word 0x05963400
.word 0x05863800
.word 0x05863C00
.word 0x05764000
.word 0x05664400
.word 0x05564800
.word 0x05464C00
.word 0x05465000
.word 0x05365400
.word 0x05265800
.word 0x05165C00
.word 0x05166000
.word 0x05066400
.word 0x04F66800
.word 0x04E66C00
.word 0x04D67000
.word 0x04D67400
.word 0x04C67800
.word 0x04B67C00
.word 0x04A68000
.word 0x04A68400
.word 0x04968800
.word 0x04868C00
.word 0x04769000
.word 0x04669400
.word 0x04669800
.word 0x04569C00
.word 0x0446A000
.word 0x0436A400
.word 0x0426A800
.word 0x0426AC00
.word 0x0416B000
.word 0x0406B400
.word 0x03F6B800
.word 0x03F6BC00
.word 0x03E6C000
.word 0x03D6C400
.word 0x03C6C800
.word 0x03B6CC00
.word 0x03B6D000
.word 0x03A6D400
.word 0x0396D800
.word 0x0386DC00
.word 0x0386E000
.word 0x0376E400
.word 0x0366E800
.word 0x0356EC00
.word 0x0346F000
.word 0x0346F400
.word 0x0336F800
.word 0x0326FC00
.word 0x03170000
.word 0x03070400
.word 0x03070800
.word 0x02F70C00
.word 0x02E71000
.word 0x02D71400
.word 0x02D71800
.word 0x02C71C00
.word 0x02B72400
.word 0x02A72800
.word 0x02972C00
.word 0x02973000
.word 0x02873400
.word 0x02773800
.word 0x02673C00
.word 0x02674000
.word 0x02574400
.word 0x02474800
.word 0x02374C00
.word 0x02275000
.word 0x02275400
.word 0x02175800
.word 0x02075C00
.word 0x01F76000
.word 0x01E76400
.word 0x01E76800
.word 0x01D76C00
.word 0x01C77000
.word 0x01B77400
.word 0x01B77800
.word 0x01A77C00
.word 0x01978000
.word 0x01878400
.word 0x01778800
.word 0x01778C00
.word 0x01679000
.word 0x01579400
.word 0x01479800
.word 0x01479C00
.word 0x0137A000
.word 0x0127A400
.word 0x0117A800
.word 0x0107AC00
.word 0x0107B000
.word 0x00F7B400
.word 0x00E7B800
.word 0x00D7BC00
.word 0x00D7C000
.word 0x00C7C400
.word 0x00B7C800
.word 0x00A7CC00
.word 0x0097D000
.word 0x0097D400
.word 0x0087D800
.word 0x0077DC00
.word 0x0067E000
.word 0x0057E400
.word 0x0057E800
.word 0x0047EC00
.word 0x0037F000
.word 0x0027F400
.word 0x0027F800
.word 0x0017FC00
.word 0x00080000
.word 0xFFF80400
.word 0xFFE80800
.word 0xFFE80C00
.word 0xFFD81000
.word 0xFFC81400
.word 0xFFB81800
.word 0xFFB81C00
.word 0xFFA82000
.word 0xFF982400
.word 0xFF882800
.word 0xFF782C00
.word 0xFF783000
.word 0xFF683400
.word 0xFF583800
.word 0xFF483C00
.word 0xFF384000
.word 0xFF384400
.word 0xFF284800
.word 0xFF184C00
.word 0xFF085000
.word 0xFF085400
.word 0xFEF85800
.word 0xFEE85C00
.word 0xFED86000
.word 0xFEC86400
.word 0xFEC86800
.word 0xFEB86C00
.word 0xFEA87000
.word 0xFE987400
.word 0xFE987800
.word 0xFE887C00
.word 0xFE788000
.word 0xFE688400
.word 0xFE588800
.word 0xFE588C00
.word 0xFE489000
.word 0xFE389400
.word 0xFE289800
.word 0xFE289C00
.word 0xFE18A000
.word 0xFE08A400
.word 0xFDF8A800
.word 0xFDE8AC00
.word 0xFDE8B000
.word 0xFDD8B400
.word 0xFDC8B800
.word 0xFDB8BC00
.word 0xFDA8C000
.word 0xFDA8C400
.word 0xFD98C800
.word 0xFD88CC00
.word 0xFD78D000
.word 0xFD78D400
.word 0xFD68D800
.word 0xFD58DC00
.word 0xFD48E400
.word 0xFD38E800
.word 0xFD38EC00
.word 0xFD28F000
.word 0xFD18F400
.word 0xFD08F800
.word 0xFD08FC00
.word 0xFCF90000
.word 0xFCE90400
.word 0xFCD90800
.word 0xFCC90C00
.word 0xFCC91000
.word 0xFCB91400
.word 0xFCA91800
.word 0xFC991C00
.word 0xFC892000
.word 0xFC892400
.word 0xFC792800
.word 0xFC692C00
.word 0xFC593000
.word 0xFC593400
.word 0xFC493800
.word 0xFC393C00
.word 0xFC294000
.word 0xFC194400
.word 0xFC194800
.word 0xFC094C00
.word 0xFBF95000
.word 0xFBE95400
.word 0xFBE95800
.word 0xFBD95C00
.word 0xFBC96000
.word 0xFBB96400
.word 0xFBA96800
.word 0xFBA96C00
.word 0xFB997000
.word 0xFB897400
.word 0xFB797800
.word 0xFB697C00
.word 0xFB698000
.word 0xFB598400
.word 0xFB498800
.word 0xFB398C00
.word 0xFB399000
.word 0xFB299400
.word 0xFB199800
.word 0xFB099C00
.word 0xFAF9A000
.word 0xFAF9A400
.word 0xFAE9A800
.word 0xFAD9AC00
.word 0xFAC9B000
.word 0xFAC9B400
.word 0xFAB9B800
.word 0xFAA9BC00
.word 0xFA99C000
.word 0xFA89C400
.word 0xFA89C800
.word 0xFA79CC00
.word 0xFA69D000
.word 0xFA59D400
.word 0xFA59D800
.word 0xFA49DC00
.word 0xFA39E000
.word 0xFA29E400
.word 0xFA19E800
.word 0xFA19EC00
.word 0xFA09F000
.word 0xF9F9F400
.word 0xF9E9F800
.word 0xF9D9FC00
.word 0xF9DA0000
@ Chroma V lookup table: 256 words, indexed by the V byte. Carries the V
@ contribution to the green and red fields. The routine reaches it as
@ u_table + 1024, so it must directly follow the 256-word u_table.
v_table:
.word 0x0D00019A
.word 0x0CF0019B
.word 0x0CD0019B
.word 0x0CB0019C
.word 0x0CA0019D
.word 0x0C80019E
.word 0x0C60019F
.word 0x0C50019F
.word 0x0C3001A0
.word 0x0C1001A1
.word 0x0C0001A2
.word 0x0BE001A3
.word 0x0BD001A3
.word 0x0BB001A4
.word 0x0B9001A5
.word 0x0B8001A6
.word 0x0B6001A7
.word 0x0B4001A7
.word 0x0B3001A8
.word 0x0B1001A9
.word 0x0B0001AA
.word 0x0AE001AB
.word 0x0AC001AB
.word 0x0AB001AC
.word 0x0A9001AD
.word 0x0A7001AE
.word 0x0A6001AF
.word 0x0A4001AF
.word 0x0A3001B0
.word 0x0A1001B1
.word 0x09F001B2
.word 0x09E001B3
.word 0x09C001B3
.word 0x09A001B4
.word 0x099001B5
.word 0x097001B6
.word 0x096001B7
.word 0x094001B7
.word 0x092001B8
.word 0x091001B9
.word 0x08F001BA
.word 0x08D001BB
.word 0x08C001BB
.word 0x08A001BC
.word 0x089001BD
.word 0x087001BE
.word 0x085001BF
.word 0x084001BF
.word 0x082001C0
.word 0x080001C1
.word 0x07F001C2
.word 0x07D001C3
.word 0x07C001C3
.word 0x07A001C4
.word 0x078001C5
.word 0x077001C6
.word 0x075001C7
.word 0x073001C7
.word 0x072001C8
.word 0x070001C9
.word 0x06F001CA
.word 0x06D001CB
.word 0x06B001CB
.word 0x06A001CC
.word 0x068001CD
.word 0x066001CE
.word 0x065001CF
.word 0x063001CF
.word 0x062001D0
.word 0x060001D1
.word 0x05E001D2
.word 0x05D001D3
.word 0x05B001D3
.word 0x059001D4
.word 0x058001D5
.word 0x056001D6
.word 0x055001D7
.word 0x053001D7
.word 0x051001D8
.word 0x050001D9
.word 0x04E001DA
.word 0x04C001DA
.word 0x04B001DB
.word 0x049001DC
.word 0x048001DD
.word 0x046001DE
.word 0x044001DE
.word 0x043001DF
.word 0x041001E0
.word 0x03F001E1
.word 0x03E001E2
.word 0x03C001E2
.word 0x03B001E3
.word 0x039001E4
.word 0x037001E5
.word 0x036001E6
.word 0x034001E6
.word 0x032001E7
.word 0x031001E8
.word 0x02F001E9
.word 0x02E001EA
.word 0x02C001EA
.word 0x02A001EB
.word 0x029001EC
.word 0x027001ED
.word 0x025001EE
.word 0x024001EE
.word 0x022001EF
.word 0x021001F0
.word 0x01F001F1
.word 0x01D001F2
.word 0x01C001F2
.word 0x01A001F3
.word 0x018001F4
.word 0x017001F5
.word 0x015001F6
.word 0x014001F6
.word 0x012001F7
.word 0x010001F8
.word 0x00F001F9
.word 0x00D001FA
.word 0x00B001FA
.word 0x00A001FB
.word 0x008001FC
.word 0x007001FD
.word 0x005001FE
.word 0x003001FE
.word 0x002001FF
.word 0x00000200
.word 0xFFE00201
.word 0xFFD00202
.word 0xFFB00202
.word 0xFF900203
.word 0xFF800204
.word 0xFF600205
.word 0xFF500206
.word 0xFF300206
.word 0xFF100207
.word 0xFF000208
.word 0xFEE00209
.word 0xFEC0020A
.word 0xFEB0020A
.word 0xFE90020B
.word 0xFE80020C
.word 0xFE60020D
.word 0xFE40020E
.word 0xFE30020E
.word 0xFE10020F
.word 0xFDF00210
.word 0xFDE00211
.word 0xFDC00212
.word 0xFDB00212
.word 0xFD900213
.word 0xFD700214
.word 0xFD600215
.word 0xFD400216
.word 0xFD200216
.word 0xFD100217
.word 0xFCF00218
.word 0xFCE00219
.word 0xFCC0021A
.word 0xFCA0021A
.word 0xFC90021B
.word 0xFC70021C
.word 0xFC50021D
.word 0xFC40021E
.word 0xFC20021E
.word 0xFC10021F
.word 0xFBF00220
.word 0xFBD00221
.word 0xFBC00222
.word 0xFBA00222
.word 0xFB800223
.word 0xFB700224
.word 0xFB500225
.word 0xFB400226
.word 0xFB200226
.word 0xFB000227
.word 0xFAF00228
.word 0xFAD00229
.word 0xFAB00229
.word 0xFAA0022A
.word 0xFA80022B
.word 0xFA70022C
.word 0xFA50022D
.word 0xFA30022D
.word 0xFA20022E
.word 0xFA00022F
.word 0xF9E00230
.word 0xF9D00231
.word 0xF9B00231
.word 0xF9A00232
.word 0xF9800233
.word 0xF9600234
.word 0xF9500235
.word 0xF9300235
.word 0xF9100236
.word 0xF9000237
.word 0xF8E00238
.word 0xF8D00239
.word 0xF8B00239
.word 0xF890023A
.word 0xF880023B
.word 0xF860023C
.word 0xF840023D
.word 0xF830023D
.word 0xF810023E
.word 0xF800023F
.word 0xF7E00240
.word 0xF7C00241
.word 0xF7B00241
.word 0xF7900242
.word 0xF7700243
.word 0xF7600244
.word 0xF7400245
.word 0xF7300245
.word 0xF7100246
.word 0xF6F00247
.word 0xF6E00248
.word 0xF6C00249
.word 0xF6A00249
.word 0xF690024A
.word 0xF670024B
.word 0xF660024C
.word 0xF640024D
.word 0xF620024D
.word 0xF610024E
.word 0xF5F0024F
.word 0xF5D00250
.word 0xF5C00251
.word 0xF5A00251
.word 0xF5900252
.word 0xF5700253
.word 0xF5500254
.word 0xF5400255
.word 0xF5200255
.word 0xF5000256
.word 0xF4F00257
.word 0xF4D00258
.word 0xF4C00259
.word 0xF4A00259
.word 0xF480025A
.word 0xF470025B
.word 0xF450025C
.word 0xF430025D
.word 0xF420025D
.word 0xF400025E
.word 0xF3F0025F
.word 0xF3D00260
.word 0xF3B00261
.word 0xF3A00261
.word 0xF3800262
.word 0xF3600263
.word 0xF3500264
.word 0xF3300265
.word 0xF3100265

This trades maths operations for table lookups - which may or may not be a good thing on the NDS. There may well be more speedups to be had; for instance, rather than reading pairs of Y pixels from subsequent rows, using LDRBs we may be better reading adjacent ones with LDRH and masking.

This code could be faster still if only the "Y+B" components could be kept below 512. This can be achieved by bending the tables slightly. I might experiment with that later.

I'm off on holiday on Wednesday, so if I go silent, please don't assume that it's disinterest.

Robin

Tables provide a convenient way to use up otherwise unused DTCM :)
_________________
"We are merely sprites that dance at the beck and call of our button pressing overlord."

RobinWatts wrote:

chishm wrote:

Has anyone achieved something faster?

This code could be faster still if only the "Y+B" components could be kept below 512. This can be acheived by bending the tables slightly. I might experiment with that later.

Second attempt, using the slightly bent tables. I defy anyone to tell the difference visually :)

Code:

@ YUV-> RGB conversion code Copyright (C) 2008 Robin Watts (robin@wss.co.uk).
@
@ Licensed under the GPL. If you need it under another license, contact me
@ and ask.
@
@ This program is free software ; you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation ; either version 2 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY ; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program ; if not, write to the Free Software
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@
@
@ The algorithm used here is based heavily on one created by Sophie Wilson
@ of Acorn/e-14/Broadcomm. Many thanks.
@
@ Additional tweaks (in the fast fixup code) are from Paul Gardiner.
@
@ The old implementation of YUV -> RGB did:
@
@ R = CLAMP((Y-16)*1.164 + 1.596*V)
@ G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
@ B = CLAMP((Y-16)*1.164 + 2.018*U )
@
@ We're going to bend that here as follows:
@
@ R = CLAMP(y + 1.596*V)
@ G = CLAMP(y - 0.383*U - 0.813*V)
@ B = CLAMP(y + 1.976*U )
@
@ where y = 0 for Y <= 16,
@ y = ( Y-16)*1.164, for 16 < Y <= 239,
@ y = (239-16)*1.164, for 239 < Y
@
@ i.e. We clamp Y to the 16 to 239 range (which it is supposed to be in
@ anyway). We then pick the B_U factor so that B never exceeds 511. We then
@ shrink the G_U factor in line with that to avoid a colour shift as much as
@ possible.
@
@ We're going to use tables to do it faster, but rather than doing it using
@ 5 tables as as the above suggests, we're going to do it using just 3.
@
@ We do this by working in parallel within a 32 bit word, and using one
@ table each for Y U and V.
@
@ Source Y values are 0 to 255, so 0.. 260 after scaling
@ Source U values are -128 to 127, so -49.. 49(G), -253..251(B) after
@ Source V values are -128 to 127, so -204..203(R), -104..103(G) after
@
@ So total summed values:
@ -223 <= R <= 481, -173 <= G <= 431, -253 <= B < 511
@
@ We need to pack R G and B into a 32 bit word, and because of Bs range we
@ need 2 bits above the valid range of B to detect overflow, and another one
@ to detect the sense of the overflow. We therefore adopt the following
@ representation:
@
@ osGGGGGgggggosBBBBBbbbosRRRRRrrr
@
@ Each such word breaks down into 3 ranges.
@
@ osGGGGGggggg osBBBBBbbb osRRRRRrrr
@
@ Thus we have 8 bits for each B and R table entry, and 10 bits for G (good
@ as G is the most noticable one). The s bit for each represents the sign,
@ and o represents the overflow.
@
@ For R and B we pack the table by taking the 11 bit representation of their
@ values, and toggling bit 10 in the U and V tables.
@
@ For the green case we calculate 4*G (thus effectively using 10 bits for the
@ valid range) truncate to 12 bits. We toggle bit 11 in the Y table.

@ Picture geometry used by yv12_to_rgb555_asm. The strides are byte distances
@ between consecutive rows of the chroma (UV), luma (Y) and output (X)
@ buffers; WIDTH is the number of pixels converted per row.
.equ UV_STRIDE, 192
.equ Y_STRIDE, 384
.equ X_STRIDE, 512
.equ WIDTH,    256

@ Bit position at which the per-row column countdown is packed into r14,
@ above the remaining-row count.  Row counts below (2 << COL_SHIFT) fit.
.equ COL_SHIFT, 16

@ CLAMP_PIXEL px -- saturate the colour fields of one packed pixel.
@   In:  \px = packed pixel (y + u + v table words summed)
@        r12 = \px AND r5, i.e. the out-of-range flag of each field
@        r5  = 0x40040100 (flag bit of the R, B and G fields)
@   Out: \px with every flagged field forced to its maximum (value too
@        large) or to zero (value negative).  r12 is clobbered.
@   The tables are built so that in-range and too-large fields have the
@   bit above the flag (the 'o' bit) set and negative fields have it clear.
.macro CLAMP_PIXEL px
        SUB     r12, r12, r12, LSR #8   @ spread each flag over the 8 bits below it
        ORR     \px, \px, r12           @ flagged fields -> all ones (maximum)
        BIC     r12, r5, \px, LSR #1    @ flag positions whose 'o' bit is clear
        ADD     \px, \px, r12, LSR #8   @ +1 there: an all-ones field rolls over to zero
.endm

.align
.global yv12_to_rgb555_asm
.func yv12_to_rgb555_asm

@ void yv12_to_rgb555_asm(uint8_t *x_ptr, uint8_t *y_ptr,
@                         uint8_t *u_ptr, uint8_t *v_ptr, int height)
@
@ Converts planar YUV 4:2:0 to 15-bit colour.  Every U/V pair is shared by
@ a 2x2 block of Y samples, so each inner iteration emits two pixels on the
@ current output row and two on the row below.  WIDTH pixels are converted
@ on every row.
@
@ In:   r0 = x_ptr     output, 16 bits per pixel, rows X_STRIDE bytes apart
@       r1 = y_ptr     Y plane, rows Y_STRIDE bytes apart
@       r2 = u_ptr     U plane, rows UV_STRIDE bytes apart
@       r3 = v_ptr     V plane, rows UV_STRIDE bytes apart
@       [sp] = height  rows to convert.  Rows are handled in pairs, so an
@                      odd value converts one extra row; a value <= 0
@                      converts nothing.  Must be below (2 << COL_SHIFT).
@ Out:  pixels with R in bits 0-4, G in bits 5-9, B in bits 10-14 and
@       bit 15 clear.
@ Regs: r4-r11 are saved and restored; r0-r3, r12 and the flags are
@       clobbered.
@
@ Register roles inside the loops:
@       r4  = 0x07C07C1F  mask selecting the top 5 bits of each field
@       r5  = 0x40040100  out-of-range flag bit of each field
@       r8  = y_table, r9 = u_table, r10 = v_table
@       r11 = u_table[u] + v_table[v] for the current 2x2 block
@       r14 = rows remaining, plus a negative column countdown while a
@             row pair is in progress

yv12_to_rgb555_asm:
        STMFD   r13!, {r4-r11, r14}

        ADR     r8, y_table
        LDR     r14, [r13, #9*4]        @ r14 = height: full word, just above the 9 saved registers
        ADD     r9, r8, #1024           @ u_table directly follows the 256-word y_table
        ADD     r10, r9, #1024          @ v_table directly follows the 256-word u_table
        LDR     r4, =0x07C07C1F
        LDR     r5, =0x40040100
        CMP     r14, #0
        BLE     finished                @ nothing to convert
yloop:
        SUB     r14, r14, #(WIDTH << COL_SHIFT) @ r14 = rows - (WIDTH << COL_SHIFT)
xloop:
        LDRB    r11, [r2], #1           @ r11 = u  = *u_ptr++
        LDRB    r12, [r3], #1           @ r12 = v  = *v_ptr++
        LDRB    r7, [r1, #Y_STRIDE]     @ r7  = y2 = y_ptr[Y_STRIDE]  (row below)
        LDRB    r6, [r1], #1            @ r6  = y0 = *y_ptr++
        LDR     r11, [r9, r11, LSL #2]  @ r11 = u_table[u]
        LDR     r12, [r10, r12, LSL #2] @ r12 = v_table[v]
        LDR     r7, [r8, r7, LSL #2]    @ r7  = y_table[y2]
        LDR     r6, [r8, r6, LSL #2]    @ r6  = y_table[y0]
        ADD     r11, r11, r12           @ r11 = uv = u + v, reused for all four pixels

        ADD     r7, r7, r11             @ r7 = y2 + uv
        ADD     r6, r6, r11             @ r6 = y0 + uv
        ANDS    r12, r7, r5             @ r12 = out-of-range flags of r7
        TSTEQ   r6, r5                  @ if r7 is clean, test r6 as well
        BNE     fix01                   @ at least one pixel needs clamping
return01:
        AND     r7, r4, r7, LSR #3      @ keep the top 5 bits of each field
        AND     r6, r4, r6, LSR #3
        ORR     r7, r7, r7, LSR #17     @ drop G (bits 22-26) into bits 5-9
        ORR     r6, r6, r6, LSR #17
        ADD     r12, r0, #X_STRIDE      @ X_STRIDE does not fit STRH's 8-bit offset
        STRH    r7, [r12]               @ x_ptr[row below] = pixel(y2)
        LDRB    r12, [r1, #Y_STRIDE]    @ r12 = y3 = y_ptr[Y_STRIDE]
        LDRB    r7, [r1], #1            @ r7  = y1 = *y_ptr++
        STRH    r6, [r0], #2            @ *x_ptr++ = pixel(y0)

        LDR     r12, [r8, r12, LSL #2]  @ r12 = y_table[y3]
        LDR     r6, [r8, r7, LSL #2]    @ r6  = y_table[y1]

        ADD     r7, r12, r11            @ r7 = y3 + uv
        ADD     r6, r6, r11             @ r6 = y1 + uv
        ANDS    r12, r7, r5
        TSTEQ   r6, r5
        BNE     fix02
return02:
        AND     r7, r4, r7, LSR #3
        AND     r6, r4, r6, LSR #3
        ORR     r7, r7, r7, LSR #17
        ORR     r6, r6, r6, LSR #17
        ADD     r12, r0, #X_STRIDE
        STRH    r7, [r12]               @ x_ptr[row below] = pixel(y3)
        STRH    r6, [r0], #2            @ *x_ptr++ = pixel(y1)
        ADDS    r14, r14, #(2 << COL_SHIFT) @ two columns done
        BLT     xloop                   @ still negative: more columns on this row pair

        ADD     r0, r0, #2*X_STRIDE - 2*WIDTH   @ x_ptr down two rows, back to column 0
        ADD     r1, r1, #2*Y_STRIDE - WIDTH     @ y_ptr down two rows, back to column 0
        ADD     r2, r2, #UV_STRIDE - (WIDTH/2)  @ u_ptr down one chroma row
        ADD     r3, r3, #UV_STRIDE - (WIDTH/2)  @ v_ptr down one chroma row

        SUBS    r14, r14, #2            @ two rows done
        BGT     yloop

finished:
        LDMFD   r13!, {r4-r11, pc}

fix01:
        @ r7/r6 hold pixels y2/y0, at least one with an out-of-range field.
        CLAMP_PIXEL r7                  @ r12 is already r7 AND r5 (from ANDS)
        AND     r12, r6, r5
        CLAMP_PIXEL r6
        B       return01
fix02:
        @ r7/r6 hold pixels y3/y1, at least one with an out-of-range field.
        CLAMP_PIXEL r7                  @ r12 is already r7 AND r5 (from ANDS)
        AND     r12, r6, r5
        CLAMP_PIXEL r6
        B       return02
.endfunc
.pool

@ y_table: 256 words, one per possible Y byte; the conversion loop loads
@ y_table[Y] with a word index (Y LSL #2).  Each word carries the scaled
@ luma in all three packed colour fields (R bits 0-9, B bits 10-19,
@ G bits 20-31).  The entries for Y = 239..255 are identical.
@ u_table must follow immediately: the code forms its address as
@ y_table + 1024.
y_table:
.word 0x7FFFFFED
.word 0x7FFFFFEF
.word 0x7FFFFFF0
.word 0x7FFFFFF1
.word 0x7FFFFFF2
.word 0x7FFFFFF3
.word 0x7FFFFFF4
.word 0x7FFFFFF6
.word 0x7FFFFFF7
.word 0x7FFFFFF8
.word 0x7FFFFFF9
.word 0x7FFFFFFA
.word 0x7FFFFFFB
.word 0x7FFFFFFD
.word 0x7FFFFFFE
.word 0x7FFFFFFF
.word 0x80000000
.word 0x80500401
.word 0x80900802
.word 0x80E00C03
.word 0x81301405
.word 0x81701806
.word 0x81C01C07
.word 0x82102008
.word 0x82502409
.word 0x82A0280A
.word 0x82F0300C
.word 0x8330340D
.word 0x8380380E
.word 0x83D03C0F
.word 0x84104010
.word 0x84604411
.word 0x84A04C13
.word 0x84F05014
.word 0x85405415
.word 0x85805816
.word 0x85D05C17
.word 0x86206018
.word 0x8660681A
.word 0x86B06C1B
.word 0x8700701C
.word 0x8740741D
.word 0x8790781E
.word 0x87E07C1F
.word 0x88208421
.word 0x88708822
.word 0x88C08C23
.word 0x89009024
.word 0x89509425
.word 0x89A09826
.word 0x89E0A028
.word 0x8A30A429
.word 0x8A80A82A
.word 0x8AC0AC2B
.word 0x8B10B02C
.word 0x8B60B42D
.word 0x8BA0BC2F
.word 0x8BF0C030
.word 0x8C40C431
.word 0x8C80C832
.word 0x8CD0CC33
.word 0x8D20D034
.word 0x8D60D836
.word 0x8DB0DC37
.word 0x8DF0E038
.word 0x8E40E439
.word 0x8E90E83A
.word 0x8ED0EC3B
.word 0x8F20F43D
.word 0x8F70F83E
.word 0x8FB0FC3F
.word 0x90010040
.word 0x90510441
.word 0x90910842
.word 0x90E11044
.word 0x91311445
.word 0x91711846
.word 0x91C11C47
.word 0x92112048
.word 0x92512449
.word 0x92A1284A
.word 0x92F1304C
.word 0x9331344D
.word 0x9381384E
.word 0x93D13C4F
.word 0x94114050
.word 0x94614451
.word 0x94B14C53
.word 0x94F15054
.word 0x95415455
.word 0x95915856
.word 0x95D15C57
.word 0x96216058
.word 0x9671685A
.word 0x96B16C5B
.word 0x9701705C
.word 0x9741745D
.word 0x9791785E
.word 0x97E17C5F
.word 0x98218461
.word 0x98718862
.word 0x98C18C63
.word 0x99019064
.word 0x99519465
.word 0x99A19866
.word 0x99E1A068
.word 0x9A31A469
.word 0x9A81A86A
.word 0x9AC1AC6B
.word 0x9B11B06C
.word 0x9B61B46D
.word 0x9BA1BC6F
.word 0x9BF1C070
.word 0x9C41C471
.word 0x9C81C872
.word 0x9CD1CC73
.word 0x9D21D074
.word 0x9D61D876
.word 0x9DB1DC77
.word 0x9E01E078
.word 0x9E41E479
.word 0x9E91E87A
.word 0x9EE1EC7B
.word 0x9F21F47D
.word 0x9F71F87E
.word 0x9FC1FC7F
.word 0xA0020080
.word 0xA0520481
.word 0xA0920882
.word 0xA0E21084
.word 0xA1321485
.word 0xA1721886
.word 0xA1C21C87
.word 0xA2122088
.word 0xA2522489
.word 0xA2A22C8B
.word 0xA2F2308C
.word 0xA332348D
.word 0xA382388E
.word 0xA3D23C8F
.word 0xA4124090
.word 0xA4624892
.word 0xA4B24C93
.word 0xA4F25094
.word 0xA5425495
.word 0xA5925896
.word 0xA5D25C97
.word 0xA6226098
.word 0xA672689A
.word 0xA6B26C9B
.word 0xA702709C
.word 0xA752749D
.word 0xA792789E
.word 0xA7E27C9F
.word 0xA83284A1
.word 0xA87288A2
.word 0xA8C28CA3
.word 0xA90290A4
.word 0xA95294A5
.word 0xA9A298A6
.word 0xA9E2A0A8
.word 0xAA32A4A9
.word 0xAA82A8AA
.word 0xAAC2ACAB
.word 0xAB12B0AC
.word 0xAB62B4AD
.word 0xABA2BCAF
.word 0xABF2C0B0
.word 0xAC42C4B1
.word 0xAC82C8B2
.word 0xACD2CCB3
.word 0xAD22D0B4
.word 0xAD62D8B6
.word 0xADB2DCB7
.word 0xAE02E0B8
.word 0xAE42E4B9
.word 0xAE92E8BA
.word 0xAEE2ECBB
.word 0xAF22F4BD
.word 0xAF72F8BE
.word 0xAFC2FCBF
.word 0xB00300C0
.word 0xB05304C1
.word 0xB0A308C2
.word 0xB0E310C4
.word 0xB13314C5
.word 0xB18318C6
.word 0xB1C31CC7
.word 0xB21320C8
.word 0xB25324C9
.word 0xB2A32CCB
.word 0xB2F330CC
.word 0xB33334CD
.word 0xB38338CE
.word 0xB3D33CCF
.word 0xB41340D0
.word 0xB46348D2
.word 0xB4B34CD3
.word 0xB4F350D4
.word 0xB54354D5
.word 0xB59358D6
.word 0xB5D35CD7
.word 0xB62364D9
.word 0xB67368DA
.word 0xB6B36CDB
.word 0xB70370DC
.word 0xB75374DD
.word 0xB79378DE
.word 0xB7E37CDF
.word 0xB83384E1
.word 0xB87388E2
.word 0xB8C38CE3
.word 0xB91390E4
.word 0xB95394E5
.word 0xB9A398E6
.word 0xB9F3A0E8
.word 0xBA33A4E9
.word 0xBA83A8EA
.word 0xBAD3ACEB
.word 0xBB13B0EC
.word 0xBB63B4ED
.word 0xBBA3BCEF
.word 0xBBF3C0F0
.word 0xBC43C4F1
.word 0xBC83C8F2
.word 0xBCD3CCF3
.word 0xBD23D0F4
.word 0xBD63D8F6
.word 0xBDB3DCF7
.word 0xBE03E0F8
.word 0xBE43E4F9
.word 0xBE93E8FA
.word 0xBEE3ECFB
.word 0xBF23F4FD
.word 0xBF73F8FE
.word 0xBFC3FCFF
.word 0xC0040100
.word 0xC0540501
.word 0xC0A40902
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
@ u_table: 256 words indexed by the U byte.  Entries contribute to the B
@ (bits 10-19) and G (bits 20-31) fields only; the R field (bits 0-9) is
@ zero.  The code addresses this table as y_table + 1024 and v_table as
@ u_table + 1024, so all three tables must stay contiguous and exactly
@ 256 words long.
u_table:
.word 0x0C440C00
.word 0x0C341400
.word 0x0C141C00
.word 0x0C042400
.word 0x0BE42C00
.word 0x0BC43400
.word 0x0BB43C00
.word 0x0B944400
.word 0x0B844C00
.word 0x0B645400
.word 0x0B545C00
.word 0x0B346400
.word 0x0B246C00
.word 0x0B047400
.word 0x0AF47C00
.word 0x0AD48400
.word 0x0AC48C00
.word 0x0AA49400
.word 0x0A949C00
.word 0x0A74A400
.word 0x0A54AC00
.word 0x0A44B400
.word 0x0A24BC00
.word 0x0A14C400
.word 0x09F4C800
.word 0x09E4D000
.word 0x09C4D800
.word 0x09B4E000
.word 0x0994E800
.word 0x0984F000
.word 0x0964F800
.word 0x09550000
.word 0x09350800
.word 0x09251000
.word 0x09051800
.word 0x08E52000
.word 0x08D52800
.word 0x08B53000
.word 0x08A53800
.word 0x08854000
.word 0x08754800
.word 0x08555000
.word 0x08455800
.word 0x08256000
.word 0x08156800
.word 0x07F57000
.word 0x07E57800
.word 0x07C58000
.word 0x07B58800
.word 0x07959000
.word 0x07759800
.word 0x0765A000
.word 0x0745A800
.word 0x0735B000
.word 0x0715B800
.word 0x0705C000
.word 0x06E5C800
.word 0x06D5D000
.word 0x06B5D800
.word 0x06A5E000
.word 0x0685E800
.word 0x0675F000
.word 0x0655F800
.word 0x06460000
.word 0x06260800
.word 0x06161000
.word 0x05F61400
.word 0x05D61C00
.word 0x05C62400
.word 0x05A62C00
.word 0x05963400
.word 0x05763C00
.word 0x05664400
.word 0x05464C00
.word 0x05365400
.word 0x05165C00
.word 0x05066400
.word 0x04E66C00
.word 0x04D67400
.word 0x04B67C00
.word 0x04A68400
.word 0x04868C00
.word 0x04669400
.word 0x04569C00
.word 0x0436A400
.word 0x0426AC00
.word 0x0406B400
.word 0x03F6BC00
.word 0x03D6C400
.word 0x03C6CC00
.word 0x03A6D400
.word 0x0396DC00
.word 0x0376E400
.word 0x0366EC00
.word 0x0346F400
.word 0x0336FC00
.word 0x03170400
.word 0x02F70C00
.word 0x02E71400
.word 0x02C71C00
.word 0x02B72400
.word 0x02972C00
.word 0x02873400
.word 0x02673C00
.word 0x02574400
.word 0x02374C00
.word 0x02275400
.word 0x02075C00
.word 0x01F76000
.word 0x01D76800
.word 0x01C77000
.word 0x01A77800
.word 0x01978000
.word 0x01778800
.word 0x01579000
.word 0x01479800
.word 0x0127A000
.word 0x0117A800
.word 0x00F7B000
.word 0x00E7B800
.word 0x00C7C000
.word 0x00B7C800
.word 0x0097D000
.word 0x0087D800
.word 0x0067E000
.word 0x0057E800
.word 0x0037F000
.word 0x0027F800
.word 0x00080000
.word 0xFFE80800
.word 0xFFD81000
.word 0xFFB81800
.word 0xFFA82000
.word 0xFF882800
.word 0xFF783000
.word 0xFF583800
.word 0xFF484000
.word 0xFF284800
.word 0xFF185000
.word 0xFEF85800
.word 0xFEE86000
.word 0xFEC86800
.word 0xFEB87000
.word 0xFE987800
.word 0xFE788000
.word 0xFE688800
.word 0xFE489000
.word 0xFE389800
.word 0xFE18A000
.word 0xFE08A400
.word 0xFDE8AC00
.word 0xFDD8B400
.word 0xFDB8BC00
.word 0xFDA8C400
.word 0xFD88CC00
.word 0xFD78D400
.word 0xFD58DC00
.word 0xFD48E400
.word 0xFD28EC00
.word 0xFD18F400
.word 0xFCF8FC00
.word 0xFCD90400
.word 0xFCC90C00
.word 0xFCA91400
.word 0xFC991C00
.word 0xFC792400
.word 0xFC692C00
.word 0xFC493400
.word 0xFC393C00
.word 0xFC194400
.word 0xFC094C00
.word 0xFBE95400
.word 0xFBD95C00
.word 0xFBB96400
.word 0xFBA96C00
.word 0xFB897400
.word 0xFB697C00
.word 0xFB598400
.word 0xFB398C00
.word 0xFB299400
.word 0xFB099C00
.word 0xFAF9A400
.word 0xFAD9AC00
.word 0xFAC9B400
.word 0xFAA9BC00
.word 0xFA99C400
.word 0xFA79CC00
.word 0xFA69D400
.word 0xFA49DC00
.word 0xFA39E400
.word 0xFA19EC00
.word 0xF9F9F000
.word 0xF9E9F800
.word 0xF9CA0000
.word 0xF9BA0800
.word 0xF99A1000
.word 0xF98A1800
.word 0xF96A2000
.word 0xF95A2800
.word 0xF93A3000
.word 0xF92A3800
.word 0xF90A4000
.word 0xF8FA4800
.word 0xF8DA5000
.word 0xF8CA5800
.word 0xF8AA6000
.word 0xF89A6800
.word 0xF87A7000
.word 0xF85A7800
.word 0xF84A8000
.word 0xF82A8800
.word 0xF81A9000
.word 0xF7FA9800
.word 0xF7EAA000
.word 0xF7CAA800
.word 0xF7BAB000
.word 0xF79AB800
.word 0xF78AC000
.word 0xF76AC800
.word 0xF75AD000
.word 0xF73AD800
.word 0xF72AE000
.word 0xF70AE800
.word 0xF6EAF000
.word 0xF6DAF800
.word 0xF6BB0000
.word 0xF6AB0800
.word 0xF68B1000
.word 0xF67B1800
.word 0xF65B2000
.word 0xF64B2800
.word 0xF62B3000
.word 0xF61B3800
.word 0xF5FB3C00
.word 0xF5EB4400
.word 0xF5CB4C00
.word 0xF5BB5400
.word 0xF59B5C00
.word 0xF57B6400
.word 0xF56B6C00
.word 0xF54B7400
.word 0xF53B7C00
.word 0xF51B8400
.word 0xF50B8C00
.word 0xF4EB9400
.word 0xF4DB9C00
.word 0xF4BBA400
.word 0xF4ABAC00
.word 0xF48BB400
.word 0xF47BBC00
.word 0xF45BC400
.word 0xF44BCC00
.word 0xF42BD400
.word 0xF41BDC00
.word 0xF3FBE400
.word 0xF3DBEC00
@ v_table: 256 words indexed by the V byte.  Entries contribute to the R
@ (bits 0-9) and G (bits 20-31) fields only; the B field (bits 10-19) is
@ zero.  The code addresses this table as u_table + 1024, so it must
@ directly follow the 256-word u_table.
v_table:
.word 0x1A000134
.word 0x19D00135
.word 0x19A00137
.word 0x19700139
.word 0x1930013A
.word 0x1900013C
.word 0x18D0013D
.word 0x1890013F
.word 0x18600140
.word 0x18300142
.word 0x18000144
.word 0x17C00145
.word 0x17900147
.word 0x17600148
.word 0x1730014A
.word 0x16F0014C
.word 0x16C0014D
.word 0x1690014F
.word 0x16600150
.word 0x16200152
.word 0x15F00154
.word 0x15C00155
.word 0x15900157
.word 0x15500158
.word 0x1520015A
.word 0x14F0015C
.word 0x14C0015D
.word 0x1480015F
.word 0x14500160
.word 0x14200162
.word 0x13F00164
.word 0x13B00165
.word 0x13800167
.word 0x13500168
.word 0x1320016A
.word 0x12E0016C
.word 0x12B0016D
.word 0x1280016F
.word 0x12500170
.word 0x12100172
.word 0x11E00174
.word 0x11B00175
.word 0x11800177
.word 0x11400178
.word 0x1110017A
.word 0x10E0017C
.word 0x10B0017D
.word 0x1070017F
.word 0x10400180
.word 0x10100182
.word 0x0FE00184
.word 0x0FA00185
.word 0x0F700187
.word 0x0F400188
.word 0x0F10018A
.word 0x0ED0018B
.word 0x0EA0018D
.word 0x0E70018F
.word 0x0E400190
.word 0x0E000192
.word 0x0DD00193
.word 0x0DA00195
.word 0x0D700197
.word 0x0D300198
.word 0x0D00019A
.word 0x0CD0019B
.word 0x0CA0019D
.word 0x0C60019F
.word 0x0C3001A0
.word 0x0C0001A2
.word 0x0BD001A3
.word 0x0B9001A5
.word 0x0B6001A7
.word 0x0B3001A8
.word 0x0B0001AA
.word 0x0AC001AB
.word 0x0A9001AD
.word 0x0A6001AF
.word 0x0A3001B0
.word 0x09F001B2
.word 0x09C001B3
.word 0x099001B5
.word 0x096001B7
.word 0x092001B8
.word 0x08F001BA
.word 0x08C001BB
.word 0x089001BD
.word 0x085001BF
.word 0x082001C0
.word 0x07F001C2
.word 0x07C001C3
.word 0x078001C5
.word 0x075001C7
.word 0x072001C8
.word 0x06F001CA
.word 0x06B001CB
.word 0x068001CD
.word 0x065001CF
.word 0x062001D0
.word 0x05E001D2
.word 0x05B001D3
.word 0x058001D5
.word 0x055001D7
.word 0x051001D8
.word 0x04E001DA
.word 0x04B001DB
.word 0x048001DD
.word 0x044001DE
.word 0x041001E0
.word 0x03E001E2
.word 0x03B001E3
.word 0x037001E5
.word 0x034001E6
.word 0x031001E8
.word 0x02E001EA
.word 0x02A001EB
.word 0x027001ED
.word 0x024001EE
.word 0x021001F0
.word 0x01D001F2
.word 0x01A001F3
.word 0x017001F5
.word 0x014001F6
.word 0x010001F8
.word 0x00D001FA
.word 0x00A001FB
.word 0x007001FD
.word 0x003001FE
.word 0x00000200
.word 0xFFD00202
.word 0xFF900203
.word 0xFF600205
.word 0xFF300206
.word 0xFF000208
.word 0xFEC0020A
.word 0xFE90020B
.word 0xFE60020D
.word 0xFE30020E
.word 0xFDF00210
.word 0xFDC00212
.word 0xFD900213
.word 0xFD600215
.word 0xFD200216
.word 0xFCF00218
.word 0xFCC0021A
.word 0xFC90021B
.word 0xFC50021D
.word 0xFC20021E
.word 0xFBF00220
.word 0xFBC00222
.word 0xFB800223
.word 0xFB500225
.word 0xFB200226
.word 0xFAF00228
.word 0xFAB00229
.word 0xFA80022B
.word 0xFA50022D
.word 0xFA20022E
.word 0xF9E00230
.word 0xF9B00231
.word 0xF9800233
.word 0xF9500235
.word 0xF9100236
.word 0xF8E00238
.word 0xF8B00239
.word 0xF880023B
.word 0xF840023D
.word 0xF810023E
.word 0xF7E00240
.word 0xF7B00241
.word 0xF7700243
.word 0xF7400245
.word 0xF7100246
.word 0xF6E00248
.word 0xF6A00249
.word 0xF670024B
.word 0xF640024D
.word 0xF610024E
.word 0xF5D00250
.word 0xF5A00251
.word 0xF5700253
.word 0xF5400255
.word 0xF5000256
.word 0xF4D00258
.word 0xF4A00259
.word 0xF470025B
.word 0xF430025D
.word 0xF400025E
.word 0xF3D00260
.word 0xF3A00261
.word 0xF3600263
.word 0xF3300265
.word 0xF3000266
.word 0xF2D00268
.word 0xF2900269
.word 0xF260026B
.word 0xF230026D
.word 0xF200026E
.word 0xF1C00270
.word 0xF1900271
.word 0xF1600273
.word 0xF1300275
.word 0xF0F00276
.word 0xF0C00278
.word 0xF0900279
.word 0xF060027B
.word 0xF020027C
.word 0xEFF0027E
.word 0xEFC00280
.word 0xEF900281
.word 0xEF500283
.word 0xEF200284
.word 0xEEF00286
.word 0xEEC00288
.word 0xEE800289
.word 0xEE50028B
.word 0xEE20028C
.word 0xEDF0028E
.word 0xEDB00290
.word 0xED800291
.word 0xED500293
.word 0xED200294
.word 0xECE00296
.word 0xECB00298
.word 0xEC800299
.word 0xEC50029B
.word 0xEC10029C
.word 0xEBE0029E
.word 0xEBB002A0
.word 0xEB8002A1
.word 0xEB4002A3
.word 0xEB1002A4
.word 0xEAE002A6
.word 0xEAB002A8
.word 0xEA7002A9
.word 0xEA4002AB
.word 0xEA1002AC
.word 0xE9E002AE
.word 0xE9A002B0
.word 0xE97002B1
.word 0xE94002B3
.word 0xE91002B4
.word 0xE8D002B6
.word 0xE8A002B8
.word 0xE87002B9
.word 0xE84002BB
.word 0xE80002BC
.word 0xE7D002BE
.word 0xE7A002C0
.word 0xE77002C1
.word 0xE73002C3
.word 0xE70002C4
.word 0xE6D002C6
.word 0xE6A002C8
.word 0xE66002C9
.word 0xE63002CB

Robin

Third attempt, based on v2, but this time dithering the output. Looks much nicer, IMHO.

Code:

@ YUV-> RGB conversion code Copyright (C) 2008 Robin Watts (robin@wss.co.uk).
@
@ Licensed under the GPL. If you need it under another license, contact me
@ and ask.
@
@ This program is free software ; you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation ; either version 2 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY ; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program ; if not, write to the Free Software
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@
@
@ The algorithm used here is based heavily on one created by Sophie Wilson
@ of Acorn/e-14/Broadcom. Many thanks.
@
@ Additional tweaks (in the fast fixup code) are from Paul Gardiner.
@
@ The old implementation of YUV -> RGB did:
@
@ R = CLAMP((Y-16)*1.164 + 1.596*V)
@ G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
@ B = CLAMP((Y-16)*1.164 + 2.018*U )
@
@ We're going to bend that here as follows:
@
@ R = CLAMP(y + 1.596*V)
@ G = CLAMP(y - 0.383*U - 0.813*V)
@ B = CLAMP(y + 1.976*U )
@
@ where y = 0 for Y <= 16,
@ y = ( Y-16)*1.164, for 16 < Y <= 239,
@ y = (239-16)*1.164, for 239 < Y
@
@ i.e. We clamp Y to the 16 to 239 range (which it is supposed to be in
@ anyway). We then pick the B_U factor so that B never exceeds 511. We then
@ shrink the G_U factor in line with that to avoid a colour shift as much as
@ possible.
@
@ We're going to use tables to do it faster, but rather than doing it using
@ 5 tables as the above suggests, we're going to do it using just 3.
@
@ We do this by working in parallel within a 32 bit word, and using one
@ table each for Y U and V.
@
@ Source Y values are 0 to 255, so 0.. 260 after scaling
@ Source U values are -128 to 127, so -49.. 49(G), -253..251(B) after
@ Source V values are -128 to 127, so -204..203(R), -104..103(G) after
@
@ So total summed values:
@ -223 <= R <= 481, -173 <= G <= 431, -253 <= B < 511
@
@ We need to pack R G and B into a 32 bit word, and because of Bs range we
@ need 2 bits above the valid range of B to detect overflow, and another one
@ to detect the sense of the overflow. We therefore adopt the following
@ representation:
@
@ osGGGGGgggggosBBBBBbbbosRRRRRrrr
@
@ Each such word breaks down into 3 ranges.
@
@ osGGGGGggggg osBBBBBbbb osRRRRRrrr
@
@ Thus we have 8 bits for each B and R table entry, and 10 bits for G (good
@ as G is the most noticeable one). The s bit for each represents the sign,
@ and o represents the overflow.
@
@ For R and B we pack the table by taking the 11 bit representation of their
@ values, and toggling bit 10 in the U and V tables.
@
@ For the green case we calculate 4*G (thus effectively using 10 bits for the
@ valid range) truncate to 12 bits. We toggle bit 11 in the Y table.

@ Fixed frame geometry used by yv12_to_rgb555_asm.
@ UV_STRIDE: bytes from one row of the U (or V) plane to the next.
.equ UV_STRIDE, 192
@ Y_STRIDE: bytes from one row of the Y plane to the row below it.
.equ Y_STRIDE, 384
@ X_STRIDE: bytes from one output row to the row below it.
.equ X_STRIDE, 512
@ WIDTH: pixels converted on every row (two per inner-loop iteration).
.equ WIDTH,    256

@ Bit position at which the per-row column countdown is packed into r14,
@ above the remaining-row count.  Row counts below (2 << COL_SHIFT) fit.
.equ COL_SHIFT, 16

@ CLAMP_PIXEL px -- saturate the colour fields of one packed pixel.
@   In:  \px = packed pixel (y + u + v table words summed, plus dither)
@        r12 = \px AND r5, i.e. the out-of-range flag of each field
@        r5  = 0x40040100 (flag bit of the R, B and G fields)
@   Out: \px with every flagged field forced to its maximum (value too
@        large) or to zero (value negative).  r12 is clobbered.
@   The tables are built so that in-range and too-large fields have the
@   bit above the flag (the 'o' bit) set and negative fields have it clear.
.macro CLAMP_PIXEL px
        SUB     r12, r12, r12, LSR #8   @ spread each flag over the 8 bits below it
        ORR     \px, \px, r12           @ flagged fields -> all ones (maximum)
        BIC     r12, r5, \px, LSR #1    @ flag positions whose 'o' bit is clear
        ADD     \px, \px, r12, LSR #8   @ +1 there: an all-ones field rolls over to zero
.endm

.align
.global yv12_to_rgb555_asm
.func yv12_to_rgb555_asm

@ void yv12_to_rgb555_asm(uint8_t *x_ptr, uint8_t *y_ptr,
@                         uint8_t *u_ptr, uint8_t *v_ptr, int height)
@
@ Converts planar YUV 4:2:0 to 15-bit colour with a 2x2 ordered dither.
@ Every U/V pair is shared by a 2x2 block of Y samples, so each inner
@ iteration emits two pixels on the current output row and two on the row
@ below.  WIDTH pixels are converted on every row.
@
@ Dither: before the low 3 bits of each 8-bit channel are dropped, a bias
@ (in 8-bit channel steps) is added to every channel of the pixel:
@       y0 (top left)    +3      y1 (top right)    +0
@       y2 (bottom left) +1      y3 (bottom right) +2
@ r5 LSR #8 = 0x00400401 is a bias of 1 in all three fields and
@ r5 LSR #7 = 0x00800802 is a bias of 2.
@
@ In:   r0 = x_ptr     output, 16 bits per pixel, rows X_STRIDE bytes apart
@       r1 = y_ptr     Y plane, rows Y_STRIDE bytes apart
@       r2 = u_ptr     U plane, rows UV_STRIDE bytes apart
@       r3 = v_ptr     V plane, rows UV_STRIDE bytes apart
@       [sp] = height  rows to convert.  Rows are handled in pairs, so an
@                      odd value converts one extra row; a value <= 0
@                      converts nothing.  Must be below (2 << COL_SHIFT).
@ Out:  pixels with R in bits 0-4, G in bits 5-9, B in bits 10-14 and
@       bit 15 clear.
@ Regs: r4-r11 are saved and restored; r0-r3, r12 and the flags are
@       clobbered.
@
@ Register roles inside the loops:
@       r4  = 0x07C07C1F  mask selecting the top 5 bits of each field
@       r5  = 0x40040100  out-of-range flag bit of each field
@       r8  = y_table, r9 = u_table, r10 = v_table
@       r11 = u_table[u] + v_table[v] for the current 2x2 block
@       r14 = rows remaining, plus a negative column countdown while a
@             row pair is in progress

yv12_to_rgb555_asm:
        STMFD   r13!, {r4-r11, r14}

        ADR     r8, y_table
        LDR     r14, [r13, #9*4]        @ r14 = height: full word, just above the 9 saved registers
        ADD     r9, r8, #1024           @ u_table directly follows the 256-word y_table
        ADD     r10, r9, #1024          @ v_table directly follows the 256-word u_table
        LDR     r4, =0x07C07C1F
        LDR     r5, =0x40040100
        CMP     r14, #0
        BLE     finished                @ nothing to convert
yloop:
        SUB     r14, r14, #(WIDTH << COL_SHIFT) @ r14 = rows - (WIDTH << COL_SHIFT)
xloop:
        LDRB    r11, [r2], #1           @ r11 = u  = *u_ptr++
        LDRB    r12, [r3], #1           @ r12 = v  = *v_ptr++
        LDRB    r7, [r1, #Y_STRIDE]     @ r7  = y2 = y_ptr[Y_STRIDE]  (row below)
        LDRB    r6, [r1], #1            @ r6  = y0 = *y_ptr++
        LDR     r11, [r9, r11, LSL #2]  @ r11 = u_table[u]
        LDR     r12, [r10, r12, LSL #2] @ r12 = v_table[v]
        LDR     r7, [r8, r7, LSL #2]    @ r7  = y_table[y2]
        LDR     r6, [r8, r6, LSL #2]    @ r6  = y_table[y0]
        ADD     r11, r11, r12           @ r11 = uv = u + v, reused for all four pixels

        ADD     r12, r11, r5, LSR #8    @ r12 = uv + dither bias 1
        ADD     r7, r7, r12             @ r7 = y2 + uv + 1
        ADD     r6, r6, r12             @ r6 = y0 + uv + 1
        ADD     r6, r6, r5, LSR #7      @ r6 = y0 + uv + 3
        ANDS    r12, r7, r5             @ r12 = out-of-range flags of r7
        TSTEQ   r6, r5                  @ if r7 is clean, test r6 as well
        BNE     fix01                   @ at least one pixel needs clamping
return01:
        AND     r7, r4, r7, LSR #3      @ keep the top 5 bits of each field
        AND     r6, r4, r6, LSR #3
        ORR     r7, r7, r7, LSR #17     @ drop G (bits 22-26) into bits 5-9
        ORR     r6, r6, r6, LSR #17
        ADD     r12, r0, #X_STRIDE      @ X_STRIDE does not fit STRH's 8-bit offset
        STRH    r7, [r12]               @ x_ptr[row below] = pixel(y2)
        LDRB    r12, [r1, #Y_STRIDE]    @ r12 = y3 = y_ptr[Y_STRIDE]
        LDRB    r7, [r1], #1            @ r7  = y1 = *y_ptr++
        STRH    r6, [r0], #2            @ *x_ptr++ = pixel(y0)

        LDR     r12, [r8, r12, LSL #2]  @ r12 = y_table[y3]
        LDR     r6, [r8, r7, LSL #2]    @ r6  = y_table[y1]

        ADD     r7, r12, r11            @ r7 = y3 + uv
        ADD     r6, r6, r11             @ r6 = y1 + uv      (dither bias 0)
        ADD     r7, r7, r5, LSR #7      @ r7 = y3 + uv + 2
        ANDS    r12, r7, r5
        TSTEQ   r6, r5
        BNE     fix02
return02:
        AND     r7, r4, r7, LSR #3
        AND     r6, r4, r6, LSR #3
        ORR     r7, r7, r7, LSR #17
        ORR     r6, r6, r6, LSR #17
        ADD     r12, r0, #X_STRIDE
        STRH    r7, [r12]               @ x_ptr[row below] = pixel(y3)
        STRH    r6, [r0], #2            @ *x_ptr++ = pixel(y1)
        ADDS    r14, r14, #(2 << COL_SHIFT) @ two columns done
        BLT     xloop                   @ still negative: more columns on this row pair

        ADD     r0, r0, #2*X_STRIDE - 2*WIDTH   @ x_ptr down two rows, back to column 0
        ADD     r1, r1, #2*Y_STRIDE - WIDTH     @ y_ptr down two rows, back to column 0
        ADD     r2, r2, #UV_STRIDE - (WIDTH/2)  @ u_ptr down one chroma row
        ADD     r3, r3, #UV_STRIDE - (WIDTH/2)  @ v_ptr down one chroma row

        SUBS    r14, r14, #2            @ two rows done
        BGT     yloop

finished:
        LDMFD   r13!, {r4-r11, pc}

fix01:
        @ r7/r6 hold pixels y2/y0, at least one with an out-of-range field.
        CLAMP_PIXEL r7                  @ r12 is already r7 AND r5 (from ANDS)
        AND     r12, r6, r5
        CLAMP_PIXEL r6
        B       return01
fix02:
        @ r7/r6 hold pixels y3/y1, at least one with an out-of-range field.
        CLAMP_PIXEL r7                  @ r12 is already r7 AND r5 (from ANDS)
        AND     r12, r6, r5
        CLAMP_PIXEL r6
        B       return02
.endfunc
.pool

@ y_table: 256 words, one per possible Y byte; the conversion loop loads
@ y_table[Y] with a word index (Y LSL #2).  Each word carries the scaled
@ luma in all three packed colour fields (R bits 0-9, B bits 10-19,
@ G bits 20-31).  The entries for Y = 239..255 are identical.
@ u_table must follow immediately: the code forms its address as
@ y_table + 1024.
y_table:
.word 0x7FFFFFED
.word 0x7FFFFFEF
.word 0x7FFFFFF0
.word 0x7FFFFFF1
.word 0x7FFFFFF2
.word 0x7FFFFFF3
.word 0x7FFFFFF4
.word 0x7FFFFFF6
.word 0x7FFFFFF7
.word 0x7FFFFFF8
.word 0x7FFFFFF9
.word 0x7FFFFFFA
.word 0x7FFFFFFB
.word 0x7FFFFFFD
.word 0x7FFFFFFE
.word 0x7FFFFFFF
.word 0x80000000
.word 0x80500401
.word 0x80900802
.word 0x80E00C03
.word 0x81301405
.word 0x81701806
.word 0x81C01C07
.word 0x82102008
.word 0x82502409
.word 0x82A0280A
.word 0x82F0300C
.word 0x8330340D
.word 0x8380380E
.word 0x83D03C0F
.word 0x84104010
.word 0x84604411
.word 0x84A04C13
.word 0x84F05014
.word 0x85405415
.word 0x85805816
.word 0x85D05C17
.word 0x86206018
.word 0x8660681A
.word 0x86B06C1B
.word 0x8700701C
.word 0x8740741D
.word 0x8790781E
.word 0x87E07C1F
.word 0x88208421
.word 0x88708822
.word 0x88C08C23
.word 0x89009024
.word 0x89509425
.word 0x89A09826
.word 0x89E0A028
.word 0x8A30A429
.word 0x8A80A82A
.word 0x8AC0AC2B
.word 0x8B10B02C
.word 0x8B60B42D
.word 0x8BA0BC2F
.word 0x8BF0C030
.word 0x8C40C431
.word 0x8C80C832
.word 0x8CD0CC33
.word 0x8D20D034
.word 0x8D60D836
.word 0x8DB0DC37
.word 0x8DF0E038
.word 0x8E40E439
.word 0x8E90E83A
.word 0x8ED0EC3B
.word 0x8F20F43D
.word 0x8F70F83E
.word 0x8FB0FC3F
.word 0x90010040
.word 0x90510441
.word 0x90910842
.word 0x90E11044
.word 0x91311445
.word 0x91711846
.word 0x91C11C47
.word 0x92112048
.word 0x92512449
.word 0x92A1284A
.word 0x92F1304C
.word 0x9331344D
.word 0x9381384E
.word 0x93D13C4F
.word 0x94114050
.word 0x94614451
.word 0x94B14C53
.word 0x94F15054
.word 0x95415455
.word 0x95915856
.word 0x95D15C57
.word 0x96216058
.word 0x9671685A
.word 0x96B16C5B
.word 0x9701705C
.word 0x9741745D
.word 0x9791785E
.word 0x97E17C5F
.word 0x98218461
.word 0x98718862
.word 0x98C18C63
.word 0x99019064
.word 0x99519465
.word 0x99A19866
.word 0x99E1A068
.word 0x9A31A469
.word 0x9A81A86A
.word 0x9AC1AC6B
.word 0x9B11B06C
.word 0x9B61B46D
.word 0x9BA1BC6F
.word 0x9BF1C070
.word 0x9C41C471
.word 0x9C81C872
.word 0x9CD1CC73
.word 0x9D21D074
.word 0x9D61D876
.word 0x9DB1DC77
.word 0x9E01E078
.word 0x9E41E479
.word 0x9E91E87A
.word 0x9EE1EC7B
.word 0x9F21F47D
.word 0x9F71F87E
.word 0x9FC1FC7F
.word 0xA0020080
.word 0xA0520481
.word 0xA0920882
.word 0xA0E21084
.word 0xA1321485
.word 0xA1721886
.word 0xA1C21C87
.word 0xA2122088
.word 0xA2522489
.word 0xA2A22C8B
.word 0xA2F2308C
.word 0xA332348D
.word 0xA382388E
.word 0xA3D23C8F
.word 0xA4124090
.word 0xA4624892
.word 0xA4B24C93
.word 0xA4F25094
.word 0xA5425495
.word 0xA5925896
.word 0xA5D25C97
.word 0xA6226098
.word 0xA672689A
.word 0xA6B26C9B
.word 0xA702709C
.word 0xA752749D
.word 0xA792789E
.word 0xA7E27C9F
.word 0xA83284A1
.word 0xA87288A2
.word 0xA8C28CA3
.word 0xA90290A4
.word 0xA95294A5
.word 0xA9A298A6
.word 0xA9E2A0A8
.word 0xAA32A4A9
.word 0xAA82A8AA
.word 0xAAC2ACAB
.word 0xAB12B0AC
.word 0xAB62B4AD
.word 0xABA2BCAF
.word 0xABF2C0B0
.word 0xAC42C4B1
.word 0xAC82C8B2
.word 0xACD2CCB3
.word 0xAD22D0B4
.word 0xAD62D8B6
.word 0xADB2DCB7
.word 0xAE02E0B8
.word 0xAE42E4B9
.word 0xAE92E8BA
.word 0xAEE2ECBB
.word 0xAF22F4BD
.word 0xAF72F8BE
.word 0xAFC2FCBF
.word 0xB00300C0
.word 0xB05304C1
.word 0xB0A308C2
.word 0xB0E310C4
.word 0xB13314C5
.word 0xB18318C6
.word 0xB1C31CC7
.word 0xB21320C8
.word 0xB25324C9
.word 0xB2A32CCB
.word 0xB2F330CC
.word 0xB33334CD
.word 0xB38338CE
.word 0xB3D33CCF
.word 0xB41340D0
.word 0xB46348D2
.word 0xB4B34CD3
.word 0xB4F350D4
.word 0xB54354D5
.word 0xB59358D6
.word 0xB5D35CD7
.word 0xB62364D9
.word 0xB67368DA
.word 0xB6B36CDB
.word 0xB70370DC
.word 0xB75374DD
.word 0xB79378DE
.word 0xB7E37CDF
.word 0xB83384E1
.word 0xB87388E2
.word 0xB8C38CE3
.word 0xB91390E4
.word 0xB95394E5
.word 0xB9A398E6
.word 0xB9F3A0E8
.word 0xBA33A4E9
.word 0xBA83A8EA
.word 0xBAD3ACEB
.word 0xBB13B0EC
.word 0xBB63B4ED
.word 0xBBA3BCEF
.word 0xBBF3C0F0
.word 0xBC43C4F1
.word 0xBC83C8F2
.word 0xBCD3CCF3
.word 0xBD23D0F4
.word 0xBD63D8F6
.word 0xBDB3DCF7
.word 0xBE03E0F8
.word 0xBE43E4F9
.word 0xBE93E8FA
.word 0xBEE3ECFB
.word 0xBF23F4FD
.word 0xBF73F8FE
.word 0xBFC3FCFF
.word 0xC0040100
.word 0xC0540501
.word 0xC0A40902
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
.word 0xC0E41104
@ u_table: 256 words indexed by the U byte.  Entries contribute to the B
@ (bits 10-19) and G (bits 20-31) fields only; the R field (bits 0-9) is
@ zero.  The code addresses this table as y_table + 1024 and v_table as
@ u_table + 1024, so all three tables must stay contiguous and exactly
@ 256 words long.
u_table:
.word 0x0C440C00
.word 0x0C341400
.word 0x0C141C00
.word 0x0C042400
.word 0x0BE42C00
.word 0x0BC43400
.word 0x0BB43C00
.word 0x0B944400
.word 0x0B844C00
.word 0x0B645400
.word 0x0B545C00
.word 0x0B346400
.word 0x0B246C00
.word 0x0B047400
.word 0x0AF47C00
.word 0x0AD48400
.word 0x0AC48C00
.word 0x0AA49400
.word 0x0A949C00
.word 0x0A74A400
.word 0x0A54AC00
.word 0x0A44B400
.word 0x0A24BC00
.word 0x0A14C400
.word 0x09F4C800
.word 0x09E4D000
.word 0x09C4D800
.word 0x09B4E000
.word 0x0994E800
.word 0x0984F000
.word 0x0964F800
.word 0x09550000
.word 0x09350800
.word 0x09251000
.word 0x09051800
.word 0x08E52000
.word 0x08D52800
.word 0x08B53000
.word 0x08A53800
.word 0x08854000
.word 0x08754800
.word 0x08555000
.word 0x08455800
.word 0x08256000
.word 0x08156800
.word 0x07F57000
.word 0x07E57800
.word 0x07C58000
.word 0x07B58800
.word 0x07959000
.word 0x07759800
.word 0x0765A000
.word 0x0745A800
.word 0x0735B000
.word 0x0715B800
.word 0x0705C000
.word 0x06E5C800
.word 0x06D5D000
.word 0x06B5D800
.word 0x06A5E000
.word 0x0685E800
.word 0x0675F000
.word 0x0655F800
.word 0x06460000
.word 0x06260800
.word 0x06161000
.word 0x05F61400
.word 0x05D61C00
.word 0x05C62400
.word 0x05A62C00
.word 0x05963400
.word 0x05763C00
.word 0x05664400
.word 0x05464C00
.word 0x05365400
.word 0x05165C00
.word 0x05066400
.word 0x04E66C00
.word 0x04D67400
.word 0x04B67C00
.word 0x04A68400
.word 0x04868C00
.word 0x04669400
.word 0x04569C00
.word 0x0436A400
.word 0x0426AC00
.word 0x0406B400
.word 0x03F6BC00
.word 0x03D6C400
.word 0x03C6CC00
.word 0x03A6D400
.word 0x0396DC00
.word 0x0376E400
.word 0x0366EC00
.word 0x0346F400
.word 0x0336FC00
.word 0x03170400
.word 0x02F70C00
.word 0x02E71400
.word 0x02C71C00
.word 0x02B72400
.word 0x02972C00
.word 0x02873400
.word 0x02673C00
.word 0x02574400
.word 0x02374C00
.word 0x02275400
.word 0x02075C00
.word 0x01F76000
.word 0x01D76800
.word 0x01C77000
.word 0x01A77800
.word 0x01978000
.word 0x01778800
.word 0x01579000
.word 0x01479800
.word 0x0127A000
.word 0x0117A800
.word 0x00F7B000
.word 0x00E7B800
.word 0x00C7C000
.word 0x00B7C800
.word 0x0097D000
.word 0x0087D800
.word 0x0067E000
.word 0x0057E800
.word 0x0037F000
.word 0x0027F800
.word 0x00080000
.word 0xFFE80800
.word 0xFFD81000
.word 0xFFB81800
.word 0xFFA82000
.word 0xFF882800
.word 0xFF783000
.word 0xFF583800
.word 0xFF484000
.word 0xFF284800
.word 0xFF185000
.word 0xFEF85800
.word 0xFEE86000
.word 0xFEC86800
.word 0xFEB87000
.word 0xFE987800
.word 0xFE788000
.word 0xFE688800
.word 0xFE489000
.word 0xFE389800
.word 0xFE18A000
.word 0xFE08A400
.word 0xFDE8AC00
.word 0xFDD8B400
.word 0xFDB8BC00
.word 0xFDA8C400
.word 0xFD88CC00
.word 0xFD78D400
.word 0xFD58DC00
.word 0xFD48E400
.word 0xFD28EC00
.word 0xFD18F400
.word 0xFCF8FC00
.word 0xFCD90400
.word 0xFCC90C00
.word 0xFCA91400
.word 0xFC991C00
.word 0xFC792400
.word 0xFC692C00
.word 0xFC493400
.word 0xFC393C00
.word 0xFC194400
.word 0xFC094C00
.word 0xFBE95400
.word 0xFBD95C00
.word 0xFBB96400
.word 0xFBA96C00
.word 0xFB897400
.word 0xFB697C00
.word 0xFB598400
.word 0xFB398C00
.word 0xFB299400
.word 0xFB099C00
.word 0xFAF9A400
.word 0xFAD9AC00
.word 0xFAC9B400
.word 0xFAA9BC00
.word 0xFA99C400
.word 0xFA79CC00
.word 0xFA69D400
.word 0xFA49DC00
.word 0xFA39E400
.word 0xFA19EC00
.word 0xF9F9F000
.word 0xF9E9F800
.word 0xF9CA0000
.word 0xF9BA0800
.word 0xF99A1000
.word 0xF98A1800
.word 0xF96A2000
.word 0xF95A2800
.word 0xF93A3000
.word 0xF92A3800
.word 0xF90A4000
.word 0xF8FA4800
.word 0xF8DA5000
.word 0xF8CA5800
.word 0xF8AA6000
.word 0xF89A6800
.word 0xF87A7000
.word 0xF85A7800
.word 0xF84A8000
.word 0xF82A8800
.word 0xF81A9000
.word 0xF7FA9800
.word 0xF7EAA000
.word 0xF7CAA800
.word 0xF7BAB000
.word 0xF79AB800
.word 0xF78AC000
.word 0xF76AC800
.word 0xF75AD000
.word 0xF73AD800
.word 0xF72AE000
.word 0xF70AE800
.word 0xF6EAF000
.word 0xF6DAF800
.word 0xF6BB0000
.word 0xF6AB0800
.word 0xF68B1000
.word 0xF67B1800
.word 0xF65B2000
.word 0xF64B2800
.word 0xF62B3000
.word 0xF61B3800
.word 0xF5FB3C00
.word 0xF5EB4400
.word 0xF5CB4C00
.word 0xF5BB5400
.word 0xF59B5C00
.word 0xF57B6400
.word 0xF56B6C00
.word 0xF54B7400
.word 0xF53B7C00
.word 0xF51B8400
.word 0xF50B8C00
.word 0xF4EB9400
.word 0xF4DB9C00
.word 0xF4BBA400
.word 0xF4ABAC00
.word 0xF48BB400
.word 0xF47BBC00
.word 0xF45BC400
.word 0xF44BCC00
.word 0xF42BD400
.word 0xF41BDC00
.word 0xF3FBE400
.word 0xF3DBEC00
# v_table: lookup table of 256 32-bit words, one .word per entry.
# NOTE(review): presumably indexed by the 8-bit V (Cr) chroma sample; the
# code that reads this table is outside this excerpt -- confirm there.
#
# What the values themselves show:
#   * low bits:   rise steadily from 0x134 (entry 0) to 0x2CB (entry 255),
#                 with the value 0x200 at entry 128;
#   * bits 20-31: fall steadily from 0x1A0 (entry 0) through 0x000 at
#                 entry 128, then continue downwards as 0xFFD, 0xFF9, ...
#                 to 0xE63 (entry 255), i.e. negative if read as signed.
# NOTE(review): this looks like two packed per-channel chroma contributions
# (low field ~ red with a 0x200 bias, high field ~ green) -- confirm against
# the conversion loop and the table generator before relying on it.
v_table:
.word 0x1A000134
.word 0x19D00135
.word 0x19A00137
.word 0x19700139
.word 0x1930013A
.word 0x1900013C
.word 0x18D0013D
.word 0x1890013F
.word 0x18600140
.word 0x18300142
.word 0x18000144
.word 0x17C00145
.word 0x17900147
.word 0x17600148
.word 0x1730014A
.word 0x16F0014C
.word 0x16C0014D
.word 0x1690014F
.word 0x16600150
.word 0x16200152
.word 0x15F00154
.word 0x15C00155
.word 0x15900157
.word 0x15500158
.word 0x1520015A
.word 0x14F0015C
.word 0x14C0015D
.word 0x1480015F
.word 0x14500160
.word 0x14200162
.word 0x13F00164
.word 0x13B00165
.word 0x13800167
.word 0x13500168
.word 0x1320016A
.word 0x12E0016C
.word 0x12B0016D
.word 0x1280016F
.word 0x12500170
.word 0x12100172
.word 0x11E00174
.word 0x11B00175
.word 0x11800177
.word 0x11400178
.word 0x1110017A
.word 0x10E0017C
.word 0x10B0017D
.word 0x1070017F
.word 0x10400180
.word 0x10100182
.word 0x0FE00184
.word 0x0FA00185
.word 0x0F700187
.word 0x0F400188
.word 0x0F10018A
.word 0x0ED0018B
.word 0x0EA0018D
.word 0x0E70018F
.word 0x0E400190
.word 0x0E000192
.word 0x0DD00193
.word 0x0DA00195
.word 0x0D700197
.word 0x0D300198
.word 0x0D00019A
.word 0x0CD0019B
.word 0x0CA0019D
.word 0x0C60019F
.word 0x0C3001A0
.word 0x0C0001A2
.word 0x0BD001A3
.word 0x0B9001A5
.word 0x0B6001A7
.word 0x0B3001A8
.word 0x0B0001AA
.word 0x0AC001AB
.word 0x0A9001AD
.word 0x0A6001AF
.word 0x0A3001B0
.word 0x09F001B2
.word 0x09C001B3
.word 0x099001B5
.word 0x096001B7
.word 0x092001B8
.word 0x08F001BA
.word 0x08C001BB
.word 0x089001BD
.word 0x085001BF
.word 0x082001C0
.word 0x07F001C2
.word 0x07C001C3
.word 0x078001C5
.word 0x075001C7
.word 0x072001C8
.word 0x06F001CA
.word 0x06B001CB
.word 0x068001CD
.word 0x065001CF
.word 0x062001D0
.word 0x05E001D2
.word 0x05B001D3
.word 0x058001D5
.word 0x055001D7
.word 0x051001D8
.word 0x04E001DA
.word 0x04B001DB
.word 0x048001DD
.word 0x044001DE
.word 0x041001E0
.word 0x03E001E2
.word 0x03B001E3
.word 0x037001E5
.word 0x034001E6
.word 0x031001E8
.word 0x02E001EA
.word 0x02A001EB
.word 0x027001ED
.word 0x024001EE
.word 0x021001F0
.word 0x01D001F2
.word 0x01A001F3
.word 0x017001F5
.word 0x014001F6
.word 0x010001F8
.word 0x00D001FA
.word 0x00A001FB
.word 0x007001FD
.word 0x003001FE
# Entry 128 (midpoint): upper field is zero, low field is exactly 0x200.
.word 0x00000200
.word 0xFFD00202
.word 0xFF900203
.word 0xFF600205
.word 0xFF300206
.word 0xFF000208
.word 0xFEC0020A
.word 0xFE90020B
.word 0xFE60020D
.word 0xFE30020E
.word 0xFDF00210
.word 0xFDC00212
.word 0xFD900213
.word 0xFD600215
.word 0xFD200216
.word 0xFCF00218
.word 0xFCC0021A
.word 0xFC90021B
.word 0xFC50021D
.word 0xFC20021E
.word 0xFBF00220
.word 0xFBC00222
.word 0xFB800223
.word 0xFB500225
.word 0xFB200226
.word 0xFAF00228
.word 0xFAB00229
.word 0xFA80022B
.word 0xFA50022D
.word 0xFA20022E
.word 0xF9E00230
.word 0xF9B00231
.word 0xF9800233
.word 0xF9500235
.word 0xF9100236
.word 0xF8E00238
.word 0xF8B00239
.word 0xF880023B
.word 0xF840023D
.word 0xF810023E
.word 0xF7E00240
.word 0xF7B00241
.word 0xF7700243
.word 0xF7400245
.word 0xF7100246
.word 0xF6E00248
.word 0xF6A00249
.word 0xF670024B
.word 0xF640024D
.word 0xF610024E
.word 0xF5D00250
.word 0xF5A00251
.word 0xF5700253
.word 0xF5400255
.word 0xF5000256
.word 0xF4D00258
.word 0xF4A00259
.word 0xF470025B
.word 0xF430025D
.word 0xF400025E
.word 0xF3D00260
.word 0xF3A00261
.word 0xF3600263
.word 0xF3300265
.word 0xF3000266
.word 0xF2D00268
.word 0xF2900269
.word 0xF260026B
.word 0xF230026D
.word 0xF200026E
.word 0xF1C00270
.word 0xF1900271
.word 0xF1600273
.word 0xF1300275
.word 0xF0F00276
.word 0xF0C00278
.word 0xF0900279
.word 0xF060027B
.word 0xF020027C
.word 0xEFF0027E
.word 0xEFC00280
.word 0xEF900281
.word 0xEF500283
.word 0xEF200284
.word 0xEEF00286
.word 0xEEC00288
.word 0xEE800289
.word 0xEE50028B
.word 0xEE20028C
.word 0xEDF0028E
.word 0xEDB00290
.word 0xED800291
.word 0xED500293
.word 0xED200294
.word 0xECE00296
.word 0xECB00298
.word 0xEC800299
.word 0xEC50029B
.word 0xEC10029C
.word 0xEBE0029E
.word 0xEBB002A0
.word 0xEB8002A1
.word 0xEB4002A3
.word 0xEB1002A4
.word 0xEAE002A6
.word 0xEAB002A8
.word 0xEA7002A9
.word 0xEA4002AB
.word 0xEA1002AC
.word 0xE9E002AE
.word 0xE9A002B0
.word 0xE97002B1
.word 0xE94002B3
.word 0xE91002B4
.word 0xE8D002B6
.word 0xE8A002B8
.word 0xE87002B9
.word 0xE84002BB
.word 0xE80002BC
.word 0xE7D002BE
.word 0xE7A002C0
.word 0xE77002C1
.word 0xE73002C3
.word 0xE70002C4
.word 0xE6D002C6
.word 0xE6A002C8
.word 0xE66002C9
.word 0xE63002CB

Robin

I have measured Chishm's and Robin's last code.
Setup was: 256x192, Y,U and V buffers were set to start at 0x02000000 to have some pseudorandom (opcodes) but constant data, output buffer was a 4 byte aligned malloced block

Tested 100 runs and averaged the measurements. (33MHz cycles)

Chishm's: 1 279 959
Robin's: 393 571

Both measurements include the very same calling instructions (Push & copy of arguments, pop afterwards)

( http://www.speedshare.org/download.php?id=3A61C0113 )

x3 is pretty dang good :)

I wonder how much of a performance impact this code would have on tuna-viDS.

Maxxie wrote:

I have measured Chishm's and Robin's last code.
Setup was: 256x192, Y,U and V buffers were set to start at 0x02000000 to have some pseudorandom (opcodes) but constant data, output buffer was a 4 byte aligned malloced block

To get a true timing for my code you should really test it with real world data, as the timings will vary according to what proportion of the values over/underflow (normally < 10%).

Quote:

Tested 100 runs and averaged the measurements. (33MHz cycles)

Chishm's: 1 279 959
Robin's: 393 571

Both measurements include the very same calling instructions (Push & copy of arguments, pop afterwards)

That's brilliant - exactly the kind of thing to make me smile on my holiday.

Doom5 wrote:

I wonder how much of a performance impact this code would have on tuna-viDS.

Dunno what proportion of the runtime this takes. I had a quick look through the source before I came away, and there is definite scope for more ARM coding of sections of the code. (No disrespect to Chishm here - he's done the hard part in making it work at all!)

Any hints on profiling the code to find out where I should be spending my time would be much appreciated.

I'll try to look into it some more when I get back.

TTFN,

Robin (In a hotel business centre in Ecuador)

RobinWatts wrote:

Quote:

Tested 100 runs and averaged the measurements. (33MHz cycles)

Chishm's: 1 279 959
Robin's: 393 571

Both measurements include the very same calling instructions (Push & copy of arguments, pop afterwards)

That's brilliant - exactly the kind of thing to make me smile on my holiday.

Doom5 wrote:

I wonder how much of a performance impact this code would have on tuna-viDS.

Dunno what proportion of the runtime this takes.

There are 560,190 ARM7 cycles between vblanks. A 12 fps video takes 5 vblanks or 2,800,950 cycles to present a new frame. So Chishm's YUV decoder takes 45.7% CPU while yours takes 14.1%.

Another angle: If we aren't trying to necessarily be compatible with unconverted video downloaded from the Internet, wouldn't it be faster to decode video that has been encoded using Y = (R + G) / 2 instead of the standard formula, which is closer to Y = (3R + 6G + B) / 10?
_________________
-- Where is he?
-- Who?
-- You know, the human.
-- I think he moved to Tilwick.

Cool, I'll have to see if your code will allow me to consistently get > 15FPS with libmpeg2.

RobinWatts wrote:

Maxxie wrote:

I have measured Chishm's and Robin's last code.
Setup was: 256x192, Y,U and V buffers were set to start at 0x02000000 to have some pseudorandom (opcodes) but constant data, output buffer was a 4 byte aligned malloced block

To get a true timing for my code you should really test it with real world data, as the timings will vary according to what proportion of the values over/underflow (normally < 10%).

Yes, I know; I had no YUV image at hand. If you have a test screen I'd be happy to use it instead, and have the functions output to a VRAM bank (that will reduce the speed of both measurements).

tepples wrote:

Another angle: If we aren't trying to necessarily be compatible with unconverted video downloaded from the Internet, wouldn't it be faster to decode video that has been encoded using Y = (R + G) / 2 instead of the standard formula, which is closer to Y = (3R + 6G + B) / 10?

Nah, the exact weightings used are buried in the tables, so it'd just be different values.

(Unless it caused a significant difference in the number of values that over/under flowed).

There is scope, if we put the table generation code into the app, to get brightness/contrast/colour/gamma adjustments for free.

Lazy1 wrote:

Cool, I'll have to see if your code will allow me to consistently get > 15FPS with libmpeg2.

I encoded various things at 12.5fps, and they seem fine. Didn't have time to experiment with different bitrates etc :(

If LibMPEG2 uses different weights for Cr/Cb etc, let me know what they are, and I'll generate proper tables when I get home.

Maxxie wrote:

Yes i know, i had no YUV image at hand. If you got a testscreen i'd be happy to use it instead, and have the functions operate to output to a vram bank (That will reduce the speed of both measurements)

I don't have one to hand... but I'll try and generate one when I get home (I'll try and save out a frame or two from the conversion code).

Robin

RobinWatts wrote:

Nah, the exact weightings used are buried in the tables, so it'd just be different values.

(Unless it caused a significant difference in the number of values that over/under flowed).

For certain matrices you might not need tables at all. For instance, early codecs like Cinepak use a different YUV->RGB conversion scheme:

| r | | 1.0 0.0 2.0 | | y |
| g | = | 1.0 -0.5 -1.0 | | u |
| b | | 1.0 2.0 0.0 | | v |

Taking the inverse of the 3x3 matrix gives you:

| y | | 0.2857 0.5714 0.1429 | | r |
| u | = | -0.1429 -0.2857 0.4286 | | g |
| v | | 0.3571 -0.2857 -0.0714 | | b |

The first matrix is pretty easy to handle, as the RGB values can be computed from the YUV values by means of shifts and sums.
The quality may be debatable to purists in certain environments (although Cinepak was successful for a decade on our PCs), but sticking to this conversion scheme produces no visible degradation on the DS screen IMHO. Of course this requires the video to be encoded with this custom RGB->YUV conversion process, which forces you to write your own encoder.
Nevertheless, after some quick tests I've done, even with video that was created with the classic RGB->YUV conversion matrix, converting back to RGB with the above colour scheme instead of the classic one still gives you "acceptable" results.

It remains to be checked whether a valid asm implementation of this method could beat Robin's impressive piece of code, though.

tonysavon wrote:

For certain matrices you might not need tables at all. For instance, early codecs like Cinepak use a different YUV->RGB conversion scheme:

| r | | 1.0 0.0 2.0 | | y |
| g | = | 1.0 -0.5 -1.0 | | u |
| b | | 1.0 2.0 0.0 | | v |

Really? I don't remember that from my dealings with Cinepak, but I'll take your word for it.

Quote:

The first matrix is pretty easy to handle, as the RGB values can be computed from the YUV by means of shifts and sums.

Sure. So you'd:

Load U
Load V
minus_guv_up1 = U + (V<<1)
Load Y0
R0 = Y0 +(V<<1)
G0 = Y0 - (minus_guv_up1>>1)
B0 = Y0 +(U<<1)
CLAMP(R0)
CLAMP(G0)
CLAMP(B0)
COMBINE(R0,G0,B0)
store Y0
(rinse and repeat for Y1, Y2, Y3)

Each line of the above corresponds to 1 ARM instruction (more in the case of CLAMP and COMBINE).

So that's 3 + 4*5 = 23 instructions, plus clamping and combining, for each group of 4 Y's.

Let's be generous and assume we can do CLAMPs in 1 and COMBINEs in 2, and that's another 4*5 = 20 cycles, so 43 overall. (Actually, that's probably insanely generous.)

(If we can find a way to do all the maths in the top bytes of the words, we can do the funky saturated maths trick, so the clamps can be done in 2 cycles, but I suspect it'll cost us another cycle per component to get them up there. Combine takes 5 then. So 4*(9+5) = 56, so 79 overall.)

The real figure is probably somewhere nearer the higher, unless there are some funky ARM5 or later instructions we can use.

The table-based approach takes 40ish (3 of which are for dithering), but that just detects overflows/underflows inline and jumps out to code that fixes them (potentially another 20 instructions, but run only a small percentage of the time).

It's a very clever algorithm, and one which I can't take credit for; the idea of sticking everything in the tables, using bit operations to spot the overflow/underflow and jumping out to fix it (and the initial implementation) is down to Sophie Wilson. Paul Gardiner found the fastest way of doing the fixup code (so fast that, if you expect many things to overflow, you may be best just doing the fixup inline every time). I've squeezed a couple of cycles here and there, but in all honesty credit lies with Sophie.

The fact that you can just vary the tables to vary the matrix used, as well as brightness/gamma/colour/contrast etc is just icing on the cake.

Robin

RobinWatts:
Very nice! I tried it on my standard test video. The old code (that I posted here) achieved 14.1fps on average. Your code achieved 15.6fps on average. The output doesn't appear noticeably worse either, so I'll be using your colour space converter in the next release.
_________________
http://chishm.drunkencoders.com
http://dldi.drunkencoders.com

chishm wrote:

RobinWatts:
Very nice! I tried it on my standard test video. The old code (that I posted here) achieved 14.1fps on average. Your code achieved 15.6fps on average. The output doesn't appear noticeably worse either, so I'll be using your colour space converter in the next release.

Cool. I'd like to think that the output should actually look better, with the dithering in it?

I'm just about recovered from the jetlag, so I'll try to look into some table generation code this weekend; are you interested in gamma/brightness/contrast/colour controls?

Robin

RobinWatts wrote:

Cool. I'd like to think that the output should actually look better, with the dithering in it?

Most things (like shaded gradients in a cartoon) look better. The only thing I've noticed that looks worse is hard edges, like on text.

RobinWatts wrote:

I'm just about recovered from the jetlag, so I'll try to look into some table generation code this weekend; are you interested in gamma/brightness/contrast/colour controls?

Definitely.
_________________
http://chishm.drunkencoders.com
http://dldi.drunkencoders.com

gbadev.org forum archive

DS development > Fast YUV 4:2:0 -> RGB 15 in ASM

#156728 - chishm - Tue May 13, 2008 12:07 pm

#156730 - Lazy1 - Tue May 13, 2008 12:40 pm

#156910 - HyperHacker - Thu May 15, 2008 11:35 pm

#156918 - chishm - Fri May 16, 2008 1:16 am

#158204 - RobinWatts - Sat Jun 07, 2008 11:48 am

#158210 - Dwedit - Sat Jun 07, 2008 1:35 pm

#158218 - RobinWatts - Sat Jun 07, 2008 5:35 pm

#158220 - RobinWatts - Sat Jun 07, 2008 6:12 pm

#158433 - Maxxie - Wed Jun 11, 2008 2:06 pm

#158449 - Miked0801 - Wed Jun 11, 2008 6:27 pm

#158481 - Doom5 - Thu Jun 12, 2008 3:16 am

#158537 - RobinWatts - Fri Jun 13, 2008 1:09 am

#158541 - tepples - Fri Jun 13, 2008 3:43 am

#158546 - Lazy1 - Fri Jun 13, 2008 5:59 am

#158556 - Maxxie - Fri Jun 13, 2008 12:14 pm

#158559 - RobinWatts - Fri Jun 13, 2008 1:36 pm

#158819 - tonysavon - Wed Jun 18, 2008 5:24 pm

#158828 - RobinWatts - Thu Jun 19, 2008 12:18 am

#159515 - chishm - Wed Jul 02, 2008 4:48 am

#159529 - RobinWatts - Wed Jul 02, 2008 11:40 am

#159551 - chishm - Wed Jul 02, 2008 3:13 pm