[x264-devel] updated mc_chroma_altivec
Manuel Rommel
maaanuuu at gmx.net
Fri Jul 23 21:30:20 CEST 2010
Hello,
the mc_chroma_altivec code was disabled when x264 changed its internal pixel format to interleaved U/V chroma (NV12), so I have updated it to match the new C function.
The new code passes checkasm and produces the same output as the C code on a short test clip, but I can't claim it has been thoroughly tested.
I think it could still be improved, for example through better instruction ordering inside the loops. Unfortunately, I don't have enough time at the moment to write and test further code...
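For anyone reviewing who hasn't followed the pixel format change: chroma is now stored interleaved (one U/V byte pair per pixel), so the horizontal MV offset is counted in 2-byte units and the filter writes to two separate U and V destinations. Here is a minimal standalone sketch of that logic in plain C (illustrative name, with a width parameter added for generality; not the actual x264 function):

#include <stdint.h>

/* Bilinear 1/8-pel chroma interpolation over interleaved U/V source
 * data -- an illustrative sketch of the logic the patch implements. */
static void mc_chroma_nv12_sketch( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_width, int i_height )
{
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;
    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;

    /* one chroma pixel == two interleaved bytes, hence the *2 */
    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    uint8_t *srcp = &src[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
        {
            /* U in even bytes, V in odd bytes; the x+1 tap is 2 bytes away */
            dstu[x] = ( cA*src[2*x]    + cB*src[2*x+2] +
                        cC*srcp[2*x]   + cD*srcp[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
        }
        src  += i_src_stride;
        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

The AltiVec paths in the patch do exactly this 4 or 8 pixels at a time, using vec_perm to split the interleaved results back into the separate U and V rows.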
Feel free to comment :)
Regards,
Manuel
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 24c539b..2adc3e5 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -263,7 +263,7 @@ static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
}
}
-static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -277,26 +277,24 @@ static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
int cC = (8-d8x)*d8y;
int cD = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
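+ /* U and V are interleaved (U in even bytes, V in odd), so the horizontal part of the MV offset is counted in 2-byte pixels */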
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
for( int y = 0; y < i_height; y++ )
{
- dst[0] = ( cA*src[0] + cB*src[0+1] + cC*srcp[0] + cD*srcp[0+1] + 32 ) >> 6;
- dst[1] = ( cA*src[1] + cB*src[1+1] + cC*srcp[1] + cD*srcp[1+1] + 32 ) >> 6;
+ dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
+ dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
+ dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
+ dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;
src += i_src_stride;
srcp += i_src_stride;
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
- }
-
-
-#define DO_PROCESS_W4( a ) \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
- dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
+}
-static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -311,19 +309,20 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
coeff[2] = (8-d8x)*d8y;
coeff[3] = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
LOAD_ZERO;
PREP_LOAD;
PREP_LOAD_SRC( src );
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
- vec_u8_t src2v_8A, dstv_8A;
- vec_u8_t src2v_8B, dstv_8B;
- vec_u16_t src0v_16A, src1v_16A, src2v_16A, src3v_16A, dstv_16A;
- vec_u16_t src0v_16B, src1v_16B, src2v_16B, src3v_16B, dstv_16B;
+ vec_u8_t src2v_8, dstuv, dstvv;
+ vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
vec_u16_t shiftv, k32v;
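+ /* Each result lands in the low byte of its 16-bit lane: even lanes hold U, odd lanes V. */
+ /* The 4-byte patterns are replicated into every word so vec_ste stores a correct word whatever the destination alignment. */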
+ static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
+ static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
@@ -332,51 +331,59 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8B, 5, vec_u8_t, src );
- src2v_16B = vec_u8_to_u16( src2v_8B );
- src3v_16B = vec_sld( src2v_16B, src2v_16B, 2 );
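+ /* load one interleaved row; shifting by 2 bytes (one U/V pair) gives the x+1 taps */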
+ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
for( int y = 0; y < i_height; y += 2 )
{
- src0v_16A = src2v_16B;
- src1v_16A = src3v_16B;
+ src0v_16 = src2v_16;
+ src1v_16 = src3v_16;
+ VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+
+ dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
+ dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
+ dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
+ dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
+
+ dstv16 = vec_sr( dstv16, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
+ vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
+ vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
- VEC_LOAD_G( srcp, src2v_8A, 5, vec_u8_t );
srcp += i_src_stride;
- VEC_LOAD_G( srcp, src2v_8B, 5, vec_u8_t );
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
+
+ src0v_16 = src2v_16;
+ src1v_16 = src3v_16;
+ VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_16 = vec_u8_to_u16( src2v_8 );
+ src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+
+ dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
+ dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
+ dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
+ dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
+
+ dstv16 = vec_sr( dstv16, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
+ vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
+ vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );
+
srcp += i_src_stride;
- src2v_16A = vec_u8_to_u16( src2v_8A );
- src2v_16B = vec_u8_to_u16( src2v_8B );
- src3v_16A = vec_sld( src2v_16A, src2v_16A, 2 );
- src3v_16B = vec_sld( src2v_16B, src2v_16B, 2 );
-
- src0v_16B = src2v_16A;
- src1v_16B = src3v_16A;
-
- dstv_16A = dstv_16B = k32v;
- DO_PROCESS_W4( 0 );
- DO_PROCESS_W4( 1 );
- DO_PROCESS_W4( 2 );
- DO_PROCESS_W4( 3 );
-
- dstv_16A = vec_sr( dstv_16A, shiftv );
- dstv_16B = vec_sr( dstv_16B, shiftv );
- dstv_8A = vec_u16_to_u8( dstv_16A );
- dstv_8B = vec_u16_to_u8( dstv_16B );
- vec_ste( vec_splat( (vec_u32_t) dstv_8A, 0 ), 0, (uint32_t*) dst );
- dst += i_dst_stride;
- vec_ste( vec_splat( (vec_u32_t) dstv_8B, 0 ), 0, (uint32_t*) dst );
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
}
-#define DO_PROCESS_W8( a ) \
- src##a##v_16A = vec_u8_to_u16( src##a##v_8A ); \
- src##a##v_16B = vec_u8_to_u16( src##a##v_8B ); \
- dstv_16A = vec_mladd( src##a##v_16A, coeff##a##v, dstv_16A ); \
- dstv_16B = vec_mladd( src##a##v_16B, coeff##a##v, dstv_16B )
-
-static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
@@ -391,7 +398,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
coeff[2] = (8-d8x)*d8y;
coeff[3] = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
LOAD_ZERO;
@@ -399,12 +406,15 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
- vec_u8_t src0v_8A, src1v_8A, src2v_8A, src3v_8A, dstv_8A;
- vec_u8_t src0v_8B, src1v_8B, src2v_8B, src3v_8B, dstv_8B;
- vec_u16_t src0v_16A, src1v_16A, src2v_16A, src3v_16A, dstv_16A;
- vec_u16_t src0v_16B, src1v_16B, src2v_16B, src3v_16B, dstv_16B;
+ vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
+ vec_u8_t dstuv, dstvv;
+ vec_u16_t src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
+ vec_u16_t src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
vec_u16_t shiftv, k32v;
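+ /* perm0v gathers the U result bytes (low byte of every even 16-bit lane, across both halves), perm1v the V bytes; only the first 8 bytes matter for VEC_STORE8 */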
+ static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
+ static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
@@ -413,53 +423,104 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8B, 9, vec_u8_t, src );
- src3v_8B = vec_sld( src2v_8B, src2v_8B, 1 );
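+ /* 8 interleaved U/V pairs = 16 bytes, plus 2 more bytes for the x+1 taps */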
+ VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
- for( int y = 0; y < i_height; y+=2 )
+ for( int y = 0; y < i_height; y += 2 )
{
- src0v_8A = src2v_8B;
- src1v_8A = src3v_8B;
+ src0v_8 = src2v_8;
+ src1v_8 = src3v_8;
+ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+
+ src0v_16h = vec_u8_to_u16_h( src0v_8 );
+ src0v_16l = vec_u8_to_u16_l( src0v_8 );
+ src1v_16h = vec_u8_to_u16_h( src1v_8 );
+ src1v_16l = vec_u8_to_u16_l( src1v_8 );
+ src2v_16h = vec_u8_to_u16_h( src2v_8 );
+ src2v_16l = vec_u8_to_u16_l( src2v_8 );
+ src3v_16h = vec_u8_to_u16_h( src3v_8 );
+ src3v_16l = vec_u8_to_u16_l( src3v_8 );
+
+ dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
+ dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
+ dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
+
+ dstv_16h = vec_sr( dstv_16h, shiftv );
+ dstv_16l = vec_sr( dstv_16l, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
+
+ VEC_STORE8( dstuv, dstu );
+ VEC_STORE8( dstvv, dstv );
- VEC_LOAD_G( srcp, src2v_8A, 9, vec_u8_t );
srcp += i_src_stride;
- VEC_LOAD_G( srcp, src2v_8B, 9, vec_u8_t );
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
+
+ src0v_8 = src2v_8;
+ src1v_8 = src3v_8;
+ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
+ VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+
+ src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+
+ src0v_16h = vec_u8_to_u16_h( src0v_8 );
+ src0v_16l = vec_u8_to_u16_l( src0v_8 );
+ src1v_16h = vec_u8_to_u16_h( src1v_8 );
+ src1v_16l = vec_u8_to_u16_l( src1v_8 );
+ src2v_16h = vec_u8_to_u16_h( src2v_8 );
+ src2v_16l = vec_u8_to_u16_l( src2v_8 );
+ src3v_16h = vec_u8_to_u16_h( src3v_8 );
+ src3v_16l = vec_u8_to_u16_l( src3v_8 );
+
+ dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
+ dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
+ dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
+ dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
+ dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
+
+ dstv_16h = vec_sr( dstv_16h, shiftv );
+ dstv_16l = vec_sr( dstv_16l, shiftv );
+
+ dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
+ dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
+
+ VEC_STORE8( dstuv, dstu );
+ VEC_STORE8( dstvv, dstv );
+
srcp += i_src_stride;
- src3v_8A = vec_sld( src2v_8A, src2v_8A, 1 );
- src3v_8B = vec_sld( src2v_8B, src2v_8B, 1 );
-
- src0v_8B = src2v_8A;
- src1v_8B = src3v_8A;
- dstv_16A = dstv_16B = k32v;
- DO_PROCESS_W8( 0 );
- DO_PROCESS_W8( 1 );
- DO_PROCESS_W8( 2 );
- DO_PROCESS_W8( 3 );
-
- dstv_16A = vec_sr( dstv_16A, shiftv );
- dstv_16B = vec_sr( dstv_16B, shiftv );
- dstv_8A = vec_u16_to_u8( dstv_16A );
- dstv_8B = vec_u16_to_u8( dstv_16B );
- VEC_STORE8( dstv_8A, dst );
- dst += i_dst_stride;
- VEC_STORE8( dstv_8B, dst );
- dst += i_dst_stride;
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
}
}
-static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
if( i_width == 8 )
- mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
else if( i_width == 4 )
- mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_altivec_4xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
else
- mc_chroma_2xh( dst, i_dst_stride, src, i_src_stride,
+ mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
}
@@ -800,7 +861,7 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf )
#if !X264_HIGH_BIT_DEPTH
pf->mc_luma = mc_luma_altivec;
pf->get_ref = get_ref_altivec;
-// pf->mc_chroma = mc_chroma_altivec;
+ pf->mc_chroma = mc_chroma_altivec;
pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec;