[x264-devel] updated mc_chroma_altivec
Manuel Rommel
maaanuuu at gmx.net
Fri Jul 23 21:30:20 CEST 2010
Hello,
the mc_chroma_altivec code was disabled when x264 changed its internal pixel format to interleaved U/V chroma (NV12), so I have updated it to match the new C function.
The new code passes checkasm and produces the same output as the C code on a short test clip, but I can't claim it has been thoroughly tested.
I think it could still be improved, for example through better instruction ordering inside the loops. Unfortunately, I don't have enough time at the moment to write and test further code...
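For anyone reviewing who hasn't followed the pixel format change: chroma is now stored interleaved (one U/V byte pair per pixel), so the horizontal MV offset is counted in 2-byte units and the filter writes to two separate U and V destinations. Here is a minimal standalone sketch of that logic in plain C (illustrative name, with a width parameter added for generality; not the actual x264 function):

#include <stdint.h>

/* Bilinear 1/8-pel chroma interpolation over interleaved U/V source
 * data -- an illustrative sketch of the logic the patch implements. */
static void mc_chroma_nv12_sketch( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_width, int i_height )
{
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;
    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;

    /* one chroma pixel == two interleaved bytes, hence the *2 */
    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    uint8_t *srcp = &src[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
        {
            /* U in even bytes, V in odd bytes; the x+1 tap is 2 bytes away */
            dstu[x] = ( cA*src[2*x]    + cB*src[2*x+2] +
                        cC*srcp[2*x]   + cD*srcp[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
        }
        src  += i_src_stride;
        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

The AltiVec paths in the patch do exactly this 4 or 8 pixels at a time, using vec_perm to split the interleaved results back into the separate U and V rows.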
Feel free to comment :)
Regards,
Manuel
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 24c539b..2adc3e5 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -263,7 +263,7 @@ static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
}
}
-static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -277,26 +277,24 @@ static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
int cC = (8-d8x)*d8y;
int cD = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
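+ /* U and V are interleaved (U in even bytes, V in odd), so the horizontal part of the MV offset is counted in 2-byte pixels */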
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
for( int y = 0; y < i_height; y++ )
{
- dst[0] = ( cA*src[0] + cB*src[0+1] + cC*srcp[0] + cD*srcp[0+1] + 32 ) >> 6;
- dst[1] = ( cA*src[1] + cB*src[1+1] + cC*srcp[1] + cD*srcp[1+1] + 32 ) >> 6;
+ dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
+ dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
+ dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
+ dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;
src += i_src_stride;
srcp += i_src_stride;
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
- }
-
-
-#define DO_PROCESS_W4( a ) \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
- dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
+}
-static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -311,19 +309,20 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
coeff[2] = (8-d8x)*d8y;
coeff[3] = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
LOAD_ZERO;
PREP_LOAD;
PREP_LOAD_SRC( src );
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
- vec_u8_t src2v_8A, dstv_8A;
- vec_u8_t src2v_8B, dstv_8B;
- vec_u16_t src0v_16A, src1v_16A, src2v_16A, src3v_16A, dstv_16A;
- vec_u16_t src0v_16B, src1v_16B, src2v_16B, src3v_16B, dstv_16B;
+ vec_u8_t src2v_8, dstuv, dstvv;
+ vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
vec_u16_t shiftv, k32v;
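+ /* Each result lands in the low byte of its 16-bit lane: even lanes hold U, odd lanes V. */
+ /* The 4-byte patterns are replicated into every word so vec_ste stores a correct word whatever the destination alignment. */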
+ static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
+ static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
@@ -332,51 +331,59 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8B, 5, vec_u8_t, src );
- src2v_16B = vec_u8_to_u16( src2v_8B );
- src3v_16B = vec_sld( src2v_16B, src2v_16B, 2 );
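+ /* load one interleaved row; shifting by 2 bytes (one U/V pair) gives the x+1 taps */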
+ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
for( int y = 0; y < i_height; y += 2 )
{
- src0v_16A = src2v_16B;
- src1v_16A = src3v_16B;
+ src0v_16 = src2v_16;
+ src1v_16 = src3v_16;
+ VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+
+ dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
+ dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
+ dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
+ dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
+
+ dstv16 = vec_sr( dstv16, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
+ vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
+ vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
- VEC_LOAD_G( srcp, src2v_8A, 5, vec_u8_t );
srcp += i_src_stride;
- VEC_LOAD_G( srcp, src2v_8B, 5, vec_u8_t );
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
+
+ src0v_16 = src2v_16;
+ src1v_16 = src3v_16;
+ VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+
+ dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
+ dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
+ dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
+ dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
+
+ dstv16 = vec_sr( dstv16, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
+ vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
+ vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
+
srcp += i_src_stride;
- src2v_16A = vec_u8_to_u16( src2v_8A );
- src2v_16B = vec_u8_to_u16( src2v_8B );
- src3v_16A = vec_sld( src2v_16A, src2v_16A, 2 );
- src3v_16B = vec_sld( src2v_16B, src2v_16B, 2 );
-
- src0v_16B = src2v_16A;
- src1v_16B = src3v_16A;
-
- dstv_16A = dstv_16B = k32v;
- DO_PROCESS_W4( 0 );
- DO_PROCESS_W4( 1 );
- DO_PROCESS_W4( 2 );
- DO_PROCESS_W4( 3 );
-
- dstv_16A = vec_sr( dstv_16A, shiftv );
- dstv_16B = vec_sr( dstv_16B, shiftv );
- dstv_8A = vec_u16_to_u8( dstv_16A );
- dstv_8B = vec_u16_to_u8( dstv_16B );
- vec_ste( vec_splat( (vec_u32_t) dstv_8A, 0 ), 0, (uint32_t*) dst );
- dst += i_dst_stride;
- vec_ste( vec_splat( (vec_u32_t) dstv_8B, 0 ), 0, (uint32_t*) dst );
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
}
-#define DO_PROCESS_W8( a ) \
- src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
- src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
- dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
-
-static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -391,7 +398,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
coeff[2] = (8-d8x)*d8y;
coeff[3] = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
LOAD_ZERO;
@@ -399,12 +406,15 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
- vec_u8_t src0v_8A, src1v_8A, src2v_8A, src3v_8A, dstv_8A;
- vec_u8_t src0v_8B, src1v_8B, src2v_8B, src3v_8B, dstv_8B;
- vec_u16_t src0v_16A, src1v_16A, src2v_16A, src3v_16A, dstv_16A;
- vec_u16_t src0v_16B, src1v_16B, src2v_16B, src3v_16B, dstv_16B;
+ vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
+ vec_u8_t dstuv, dstvv;
+ vec_u16_t src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
+ vec_u16_t src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
vec_u16_t shiftv, k32v;
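+ /* perm0v gathers the U result bytes (low byte of every even 16-bit lane, across both halves), perm1v the V bytes; only the first 8 bytes matter for VEC_STORE8 */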
+ static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
+ static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
@@ -413,53 +423,104 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8B, 9, vec_u8_t, src );
- src3v_8B = vec_sld( src2v_8B, src2v_8B, 1 );
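+ /* 8 interleaved U/V pairs = 16 bytes, plus 2 more bytes for the x+1 taps */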
+ VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
- for( int y = 0; y < i_height; y+=2 )
+ for( int y = 0; y < i_height; y += 2 )
{
- src0v_8A = src2v_8B;
- src1v_8A = src3v_8B;
+ src0v_8 = src2v_8;
+ src1v_8 = src3v_8;
+ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+
+ src0v_16h = vec_u8_to_u16_h( src0v_8 );
+ src0v_16l = vec_u8_to_u16_l( src0v_8 );
+ src1v_16h = vec_u8_to_u16_h( src1v_8 );
+ src1v_16l = vec_u8_to_u16_l( src1v_8 );
+ src2v_16h = vec_u8_to_u16_h( src2v_8 );
+ src2v_16l = vec_u8_to_u16_l( src2v_8 );
+ src3v_16h = vec_u8_to_u16_h( src3v_8 );
+ src3v_16l = vec_u8_to_u16_l( src3v_8 );
+
+ dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
+ dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
+ dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
+
+ dstv_16h = vec_sr( dstv_16h, shiftv );
+ dstv_16l = vec_sr( dstv_16l, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
+
+ VEC_STORE8( dstuv, dstu );
+ VEC_STORE8( dstvv, dstv );
- VEC_LOAD_G( srcp, src2v_8A, 9, vec_u8_t );
srcp += i_src_stride;
- VEC_LOAD_G( srcp, src2v_8B, 9, vec_u8_t );
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
+
+ src0v_8 = src2v_8;
+ src1v_8 = src3v_8;
+ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+
+ src0v_16h = vec_u8_to_u16_h( src0v_8 );
+ src0v_16l = vec_u8_to_u16_l( src0v_8 );
+ src1v_16h = vec_u8_to_u16_h( src1v_8 );
+ src1v_16l = vec_u8_to_u16_l( src1v_8 );
+ src2v_16h = vec_u8_to_u16_h( src2v_8 );
+ src2v_16l = vec_u8_to_u16_l( src2v_8 );
+ src3v_16h = vec_u8_to_u16_h( src3v_8 );
+ src3v_16l = vec_u8_to_u16_l( src3v_8 );
+
+ dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
+ dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
+ dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
+
+ dstv_16h = vec_sr( dstv_16h, shiftv );
+ dstv_16l = vec_sr( dstv_16l, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
+
+ VEC_STORE8( dstuv, dstu );
+ VEC_STORE8( dstvv, dstv );
+
srcp += i_src_stride;
- src3v_8A = vec_sld( src2v_8A, src2v_8A, 1 );
- src3v_8B = vec_sld( src2v_8B, src2v_8B, 1 );
-
- src0v_8B = src2v_8A;
- src1v_8B = src3v_8A;
- dstv_16A = dstv_16B = k32v;
- DO_PROCESS_W8( 0 );
- DO_PROCESS_W8( 1 );
- DO_PROCESS_W8( 2 );
- DO_PROCESS_W8( 3 );
-
- dstv_16A = vec_sr( dstv_16A, shiftv );
- dstv_16B = vec_sr( dstv_16B, shiftv );
- dstv_8A = vec_u16_to_u8( dstv_16A );
- dstv_8B = vec_u16_to_u8( dstv_16B );
- VEC_STORE8( dstv_8A, dst );
- dst += i_dst_stride;
- VEC_STORE8( dstv_8B, dst );
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
}
-static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
if( i_width == 8 )
- mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
else if( i_width == 4 )
- mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_altivec_4xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
else
- mc_chroma_2xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
}
@@ -800,7 +861,7 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf )
#if !X264_HIGH_BIT_DEPTH
pf->mc_luma = mc_luma_altivec;
pf->get_ref = get_ref_altivec;
-// pf->mc_chroma = mc_chroma_altivec;
+ pf->mc_chroma = mc_chroma_altivec;
pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec;