[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract and vec_xor.
Luca Barbato
lu_zero at gentoo.org
Sat Apr 6 15:03:08 CEST 2019
On 06/04/2019 14:22, Michail Alvanos wrote:
> Around 1-10% speedup of intra_satd_x3 and satd_ functions by
> using vec_extract instead of vec_splat and vec_ste, and by
> using vec_xor instead of vec_splat_u8 for zeroing the vector.
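The vec_extract change sounds right, but replacing vec_splat_u8( 0 ) with vec_xor( zerov, zerov ) sounds fishy: the new LOAD_ZERO initializes zerov from itself.
Could you please split the two changes into separate patches?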
lu
> ---
> common/ppc/pixel.c | 28 ++++++++++++----------------
> common/ppc/ppccommon.h | 2 +-
> 2 files changed, 13 insertions(+), 17 deletions(-)
>
> diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
> index f94029e3..c811d539 100644
> --- a/common/ppc/pixel.c
> +++ b/common/ppc/pixel.c
> @@ -117,6 +117,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
> static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
> vec_s16_t c, vec_s16_t d )
> {
> + LOAD_ZERO;
> vec_s16_t t0 = vec_abs( a );
> vec_s16_t t1 = vec_abs( b );
> vec_s16_t t2 = vec_abs( c );
> @@ -125,8 +126,8 @@ static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
> vec_s16_t s0 = vec_adds( t0, t1 );
> vec_s16_t s1 = vec_adds( t2, t3 );
>
> - vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
> - vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );
> + vec_s32_t s01 = vec_sum4s( s0, zero_s32v );
> + vec_s32_t s23 = vec_sum4s( s1, zero_s32v );
>
> return vec_add( s01, s23 );
> }
> @@ -161,9 +162,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
>
> satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
>
> - satdv = vec_sum2s( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 1 );
> - vec_ste( satdv, 0, &i_satd );
> + i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
>
> return i_satd >> 1;
> }
> @@ -280,9 +279,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
> satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
> temp4v, temp5v, temp6v, temp7v );
>
> - satdv = vec_sum2s( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 1 );
> - vec_ste( satdv, 0, &i_satd );
> + i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
>
> return i_satd >> 1;
> }
> @@ -330,8 +327,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
> temp4v, temp5v, temp6v, temp7v );
>
> satdv = vec_sums( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 3 );
> - vec_ste( satdv, 0, &i_satd );
> +
> + i_satd = vec_extract(satdv,3) ;
>
> return i_satd >> 1;
> }
> @@ -400,8 +397,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
> temp4v, temp5v, temp6v, temp7v ) );
>
> satdv = vec_sums( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 3 );
> - vec_ste( satdv, 0, &i_satd );
> +
> + i_satd = vec_extract(satdv,3) ;
>
> return i_satd >> 1;
> }
> @@ -470,8 +467,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
> temp4v, temp5v, temp6v, temp7v ) );
>
> satdv = vec_sums( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 3 );
> - vec_ste( satdv, 0, &i_satd );
> +
> + i_satd = vec_extract(satdv,3) ;
>
> return i_satd >> 1;
> }
> @@ -575,8 +572,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
> temp4v, temp5v, temp6v, temp7v ) );
>
> satdv = vec_sums( satdv, zero_s32v );
> - satdv = vec_splat( satdv, 3 );
> - vec_ste( satdv, 0, &i_satd );
> + i_satd = vec_extract(satdv,3) ;
>
> return i_satd >> 1;
> }
> diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
> index 3d1b685f..3755ce6d 100644
> --- a/common/ppc/ppccommon.h
> +++ b/common/ppc/ppccommon.h
> @@ -93,7 +93,7 @@ typedef union {
> /***********************************************************************
> * Null vector
> **********************************************************************/
> -#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
> +#define LOAD_ZERO const vec_u8_t zerov = vec_xor( zerov, zerov )
>
> #define zero_u8v (vec_u8_t) zerov
> #define zero_s8v (vec_s8_t) zerov
>
More information about the x264-devel mailing list