[x264-devel] [PATCH 3/3] ppc: Rework satd_4* likewise

Luca Barbato lu_zero at gentoo.org
Sun Aug 19 09:28:42 CEST 2018


Now 4x4 is as slow as C and 4x8 is 2% faster than before.
---
 common/ppc/pixel.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 660393fa..abb0f59b 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -114,6 +114,25 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
     VEC_ABS( a );          \
     c = vec_sum4s( a, b )
 
+static ALWAYS_INLINE vec_s32_t add_abs_4( /* returns |a|+|b|+|c|+|d| folded into four 32-bit partial sums */
+        vec_s16_t a,
+        vec_s16_t b,
+        vec_s16_t c,
+        vec_s16_t d) {
+    vec_s16_t t0 = vec_abs( a ); /* element-wise absolute value of each 16-bit input */
+    vec_s16_t t1 = vec_abs( b );
+    vec_s16_t t2 = vec_abs( c );
+    vec_s16_t t3 = vec_abs( d );
+
+    vec_s16_t s0 = vec_adds(t0, t1); /* saturating 16-bit add: |a|+|b| */
+    vec_s16_t s1 = vec_adds(t2, t3); /* saturating 16-bit add: |c|+|d| */
+
+    vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32(0) ); /* widen to 32 bits, accumulating onto a zero vector */
+    vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32(0) );
+
+    return vec_add(s01, s23); /* still a vector; the satd callers reduce it further with vec_sum2s */
+}
+
 /***********************************************************************
  * SATD 4x4
  **********************************************************************/
@@ -142,10 +161,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
 
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
+    satdv = add_abs_4(temp0v, temp1v, temp2v, temp3v);
 
     satdv = vec_sum2s( satdv, zero_s32v );
     satdv = vec_splat( satdv, 1 );
@@ -177,10 +193,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                      diff0v, diff1v, diff2v, diff3v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
-    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
+
+    satdv = add_abs_4(temp0v, temp1v, temp2v, temp3v);
 
     VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
     VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
@@ -192,10 +206,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                      diff0v, diff1v, diff2v, diff3v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
-    VEC_ADD_ABS( temp0v, satdv,     satdv );
-    VEC_ADD_ABS( temp1v, satdv,     satdv );
-    VEC_ADD_ABS( temp2v, satdv,     satdv );
-    VEC_ADD_ABS( temp3v, satdv,     satdv );
+
+    satdv = vec_add(satdv, add_abs_4(temp0v, temp1v, temp2v, temp3v));
 
     satdv = vec_sum2s( satdv, zero_s32v );
     satdv = vec_splat( satdv, 1 );
-- 
2.12.2



More information about the x264-devel mailing list