[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract and vec_xor.
Michail Alvanos
malvanos at gmail.com
Sat Apr 6 14:22:52 CEST 2019
Speed up the intra_satd_x3 and satd_* functions by around 1-10% by
using vec_extract instead of vec_splat and vec_ste, and by
using vec_xor instead of vec_splat_u8 to zero the vector.
---
common/ppc/pixel.c | 28 ++++++++++++----------------
common/ppc/ppccommon.h | 2 +-
2 files changed, 13 insertions(+), 17 deletions(-)
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index f94029e3..c811d539 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -117,6 +117,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
vec_s16_t c, vec_s16_t d )
{
+ LOAD_ZERO;
vec_s16_t t0 = vec_abs( a );
vec_s16_t t1 = vec_abs( b );
vec_s16_t t2 = vec_abs( c );
@@ -125,8 +126,8 @@ static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
vec_s16_t s0 = vec_adds( t0, t1 );
vec_s16_t s1 = vec_adds( t2, t3 );
- vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
- vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );
+ vec_s32_t s01 = vec_sum4s( s0, zero_s32v );
+ vec_s32_t s23 = vec_sum4s( s1, zero_s32v );
return vec_add( s01, s23 );
}
@@ -161,9 +162,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
- satdv = vec_sum2s( satdv, zero_s32v );
- satdv = vec_splat( satdv, 1 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
return i_satd >> 1;
}
@@ -280,9 +279,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v );
- satdv = vec_sum2s( satdv, zero_s32v );
- satdv = vec_splat( satdv, 1 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
return i_satd >> 1;
}
@@ -330,8 +327,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -400,8 +397,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -470,8 +467,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -575,8 +572,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index 3d1b685f..3755ce6d 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -93,7 +93,7 @@ typedef union {
/***********************************************************************
* Null vector
**********************************************************************/
-#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+#define LOAD_ZERO const vec_u8_t zerov = vec_xor( zerov, zerov )
#define zero_u8v (vec_u8_t) zerov
#define zero_s8v (vec_s8_t) zerov
--
2.17.1
More information about the x264-devel mailing list