[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract and vec_xor.

Michail Alvanos malvanos at gmail.com
Sat Apr 6 14:22:52 CEST 2019


Speed up the intra_satd_x3 and satd_* functions by around 1-10% by
using vec_extract instead of vec_splat followed by vec_ste to read
the result out of the vector, and by using vec_xor instead of
vec_splat_u8 to zero the vector.

---
 common/ppc/pixel.c     | 28 ++++++++++++----------------
 common/ppc/ppccommon.h |  2 +-
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index f94029e3..c811d539 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -117,6 +117,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
 static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
                                           vec_s16_t c, vec_s16_t d )
 {
+    LOAD_ZERO;
     vec_s16_t t0 = vec_abs( a );
     vec_s16_t t1 = vec_abs( b );
     vec_s16_t t2 = vec_abs( c );
@@ -125,8 +126,8 @@ static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
     vec_s16_t s0 = vec_adds( t0, t1 );
     vec_s16_t s1 = vec_adds( t2, t3 );
 
-    vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
-    vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );
+    vec_s32_t s01 = vec_sum4s( s0, zero_s32v );
+    vec_s32_t s23 = vec_sum4s( s1, zero_s32v );
 
     return vec_add( s01, s23 );
 }
@@ -161,9 +162,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
 
     satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
 
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
 
     return i_satd >> 1;
 }
@@ -280,9 +279,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
     satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                        temp4v, temp5v, temp6v, temp7v );
 
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
 
     return i_satd >> 1;
 }
@@ -330,8 +327,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                        temp4v, temp5v, temp6v, temp7v );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -400,8 +397,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -470,8 +467,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -575,8 +572,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index 3d1b685f..3755ce6d 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -93,7 +93,7 @@ typedef union {
 /***********************************************************************
  * Null vector
  **********************************************************************/
-#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+#define LOAD_ZERO const vec_u8_t zerov = vec_xor( zerov, zerov )
 
 #define zero_u8v  (vec_u8_t)  zerov
 #define zero_s8v  (vec_s8_t)  zerov
-- 
2.17.1



More information about the x264-devel mailing list