[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract and vec_xor.

Sat Apr 6 15:03:08 CEST 2019

On 06/04/2019 14:22, Michail Alvanos wrote:
> Around 1-10% speedup of intra_satd_x3 and satd_ functions by
> using vec_extract instead of vec_splat and vec_ste, and by
> using vec_xor instead of vec_splat_u8 for zeroing the vector.

vec_extract sounds right; vec_xor vs vec_splat sounds fishy: the new LOAD_ZERO initializes zerov from itself, i.e. it reads an uninitialized variable.

Could you please split the two changes into separate patches?

lu

> ---
>   common/ppc/pixel.c     | 28 ++++++++++++----------------
>   common/ppc/ppccommon.h |  2 +-
>   2 files changed, 13 insertions(+), 17 deletions(-)
> 
> diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
> index f94029e3..c811d539 100644
> --- a/common/ppc/pixel.c
> +++ b/common/ppc/pixel.c
> @@ -117,6 +117,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
>   static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
>                                             vec_s16_t c, vec_s16_t d )
>   {
> +    LOAD_ZERO;
>       vec_s16_t t0 = vec_abs( a );
>       vec_s16_t t1 = vec_abs( b );
>       vec_s16_t t2 = vec_abs( c );
> @@ -125,8 +126,8 @@ static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
>       vec_s16_t s0 = vec_adds( t0, t1 );
>       vec_s16_t s1 = vec_adds( t2, t3 );
>   
> -    vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
> -    vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );
> +    vec_s32_t s01 = vec_sum4s( s0, zero_s32v );
> +    vec_s32_t s23 = vec_sum4s( s1, zero_s32v );
>   
>       return vec_add( s01, s23 );
>   }
> @@ -161,9 +162,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
>   
>       satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
>   
> -    satdv = vec_sum2s( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 1 );
> -    vec_ste( satdv, 0, &i_satd );
> +    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
>   
>       return i_satd >> 1;
>   }
> @@ -280,9 +279,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
>       satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
>                          temp4v, temp5v, temp6v, temp7v );
>   
> -    satdv = vec_sum2s( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 1 );
> -    vec_ste( satdv, 0, &i_satd );
> +    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
>   
>       return i_satd >> 1;
>   }
> @@ -330,8 +327,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
>                          temp4v, temp5v, temp6v, temp7v );
>   
>       satdv = vec_sums( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 3 );
> -    vec_ste( satdv, 0, &i_satd );
> +
> +    i_satd =  vec_extract(satdv,3) ;
>   
>       return i_satd >> 1;
>   }
> @@ -400,8 +397,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
>                                          temp4v, temp5v, temp6v, temp7v ) );
>   
>       satdv = vec_sums( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 3 );
> -    vec_ste( satdv, 0, &i_satd );
> +
> +    i_satd =  vec_extract(satdv,3) ;
>   
>       return i_satd >> 1;
>   }
> @@ -470,8 +467,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
>                                          temp4v, temp5v, temp6v, temp7v ) );
>   
>       satdv = vec_sums( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 3 );
> -    vec_ste( satdv, 0, &i_satd );
> +
> +    i_satd =  vec_extract(satdv,3) ;
>   
>       return i_satd >> 1;
>   }
> @@ -575,8 +572,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
>                                          temp4v, temp5v, temp6v, temp7v ) );
>   
>       satdv = vec_sums( satdv, zero_s32v );
> -    satdv = vec_splat( satdv, 3 );
> -    vec_ste( satdv, 0, &i_satd );
> +    i_satd =  vec_extract(satdv,3) ;
>   
>       return i_satd >> 1;
>   }
> diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
> index 3d1b685f..3755ce6d 100644
> --- a/common/ppc/ppccommon.h
> +++ b/common/ppc/ppccommon.h
> @@ -93,7 +93,7 @@ typedef union {
>   /***********************************************************************
>    * Null vector
>    **********************************************************************/
> -#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
> +#define LOAD_ZERO const vec_u8_t zerov = vec_xor( zerov, zerov )
>   
>   #define zero_u8v  (vec_u8_t)  zerov
>   #define zero_s8v  (vec_s8_t)  zerov
>