[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract.

Michail Alvanos malvanos at gmail.com
Wed Jul 10 21:01:04 CEST 2019


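Around 1-10% speedup of the intra_satd_x3 and satd_* functions by
reading the result with vec_extract instead of vec_splat and vec_ste.
In the 4x4, 4x8 and 8x4 variants the vec_sum2s reduction is dropped as
well: elements 0 and 1 are extracted and added in scalar code instead.
The gain is larger for the small block sizes on Power 8 (see below).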

Microbenchmark results (before ==> after; lower is better):

Power 8:
satd_4x4_altivec: 148 ==> satd_4x4_altivec: 87
satd_4x8_altivec: 186 ==> satd_4x8_altivec: 128
satd_8x4_altivec: 177 ==> satd_8x4_altivec: 114
satd_8x8_altivec: 188 ==> satd_8x8_altivec: 136
satd_8x16_altivec: 300 ==> satd_8x16_altivec: 262
satd_16x8_altivec: 269 ==> satd_16x8_altivec: 228
satd_16x16_altivec: 517 ==> satd_16x16_altivec: 485
intra_satd_x3_4x4_altivec: 528 ==> intra_satd_x3_4x4_altivec: 444
intra_satd_x3_8x8c_altivec: 679 ==> intra_satd_x3_8x8c_altivec: 593
intra_satd_x3_16x16_altivec: 1815 ==> intra_satd_x3_16x16_altivec: 1724

Power 9:
satd_4x4_altivec: 131 ==> satd_4x4_altivec: 113
satd_4x8_altivec: 175 ==> satd_4x8_altivec: 155
satd_8x4_altivec: 150 ==> satd_8x4_altivec: 135
satd_8x8_altivec: 174 ==> satd_8x8_altivec: 161
satd_8x16_altivec: 290 ==> satd_8x16_altivec: 277
satd_16x8_altivec: 272 ==> satd_16x8_altivec: 270
satd_16x16_altivec: 563 ==> satd_16x16_altivec: 566
intra_satd_x3_4x4_altivec: 424 ==> intra_satd_x3_4x4_altivec: 400
intra_satd_x3_8x8c_altivec: 687 ==> intra_satd_x3_8x8c_altivec: 616
intra_satd_x3_16x16_altivec: 2047 ==> intra_satd_x3_16x16_altivec: 2062


---
 common/ppc/pixel.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index f94029e3..59a74afc 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -161,9 +161,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
 
     satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
 
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
 
     return i_satd >> 1;
 }
@@ -207,9 +205,7 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
 
     satdv = vec_add( satdv, add_abs_4( temp0v, temp1v, temp2v, temp3v ) );
 
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
 
     return i_satd >> 1;
 }
@@ -280,9 +276,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
     satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                        temp4v, temp5v, temp6v, temp7v );
 
-    satdv = vec_sum2s( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 1 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,0) + vec_extract(satdv,1);
 
     return i_satd >> 1;
 }
@@ -330,8 +324,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                        temp4v, temp5v, temp6v, temp7v );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -400,8 +394,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -470,8 +464,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
@@ -575,8 +569,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        temp4v, temp5v, temp6v, temp7v ) );
 
     satdv = vec_sums( satdv, zero_s32v );
-    satdv = vec_splat( satdv, 3 );
-    vec_ste( satdv, 0, &i_satd );
+    i_satd =  vec_extract(satdv,3) ;
 
     return i_satd >> 1;
 }
-- 
2.17.1



More information about the x264-devel mailing list