[x264-devel] ppc: Rework satd_4* likewise
Luca Barbato
git at videolan.org
Tue Mar 12 19:31:52 CET 2019
x264 | branch: master | Luca Barbato <lu_zero at gentoo.org> | Sun Aug 19 09:28:42 2018 +0200| [83acefef8990302caf962c77e5a8189bb620ca6f] | committer: Anton Mitrofanov
ppc: Rework satd_4* likewise
Now 4x4 is as slow as C and 4x8 is 2% faster than before.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=83acefef8990302caf962c77e5a8189bb620ca6f
---
common/ppc/pixel.c | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index de1877a0..5bb2bec6 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -114,6 +114,23 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
VEC_ABS( a ); \
c = vec_sum4s( a, b )
+static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
+ vec_s16_t c, vec_s16_t d )
+{
+ vec_s16_t t0 = vec_abs( a );
+ vec_s16_t t1 = vec_abs( b );
+ vec_s16_t t2 = vec_abs( c );
+ vec_s16_t t3 = vec_abs( d );
+
+ vec_s16_t s0 = vec_adds( t0, t1 );
+ vec_s16_t s1 = vec_adds( t2, t3 );
+
+ vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
+ vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );
+
+ return vec_add( s01, s23 );
+}
+
/***********************************************************************
* SATD 4x4
**********************************************************************/
@@ -142,10 +159,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
- VEC_ADD_ABS( temp0v, zero_s32v, satdv );
- VEC_ADD_ABS( temp1v, satdv, satdv );
- VEC_ADD_ABS( temp2v, satdv, satdv );
- VEC_ADD_ABS( temp3v, satdv, satdv );
+ satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
satdv = vec_sum2s( satdv, zero_s32v );
satdv = vec_splat( satdv, 1 );
@@ -177,10 +191,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
diff0v, diff1v, diff2v, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
- VEC_ADD_ABS( temp0v, zero_s32v, satdv );
- VEC_ADD_ABS( temp1v, satdv, satdv );
- VEC_ADD_ABS( temp2v, satdv, satdv );
- VEC_ADD_ABS( temp3v, satdv, satdv );
+
+ satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
@@ -192,10 +204,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
diff0v, diff1v, diff2v, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
- VEC_ADD_ABS( temp0v, satdv, satdv );
- VEC_ADD_ABS( temp1v, satdv, satdv );
- VEC_ADD_ABS( temp2v, satdv, satdv );
- VEC_ADD_ABS( temp3v, satdv, satdv );
+
+ satdv = vec_add( satdv, add_abs_4( temp0v, temp1v, temp2v, temp3v ) );
satdv = vec_sum2s( satdv, zero_s32v );
satdv = vec_splat( satdv, 1 );
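
For reference, a minimal standalone sketch (not part of the patch) of the pairwise abs-sum pattern the new add_abs_4() helper introduces, checked against a scalar reference sum. It assumes GCC or clang with -maltivec and redeclares the vec_s16_t/vec_s32_t typedefs instead of pulling in x264's common/ppc headers.

#include <altivec.h>
#include <stdio.h>
#include <stdlib.h>

typedef vector signed short vec_s16_t;
typedef vector signed int   vec_s32_t;

/* Same shape as the helper added by the patch: |a|+|b| and |c|+|d|
 * with saturating adds, then two independent vec_sum4s partial sums,
 * combined with a single vector add. */
static vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
                            vec_s16_t c, vec_s16_t d )
{
    vec_s16_t s0 = vec_adds( vec_abs( a ), vec_abs( b ) );
    vec_s16_t s1 = vec_adds( vec_abs( c ), vec_abs( d ) );
    return vec_add( vec_sum4s( s0, vec_splat_s32( 0 ) ),
                    vec_sum4s( s1, vec_splat_s32( 0 ) ) );
}

int main( void )
{
    vec_s16_t v[4] = {
        {  1,  -2,  3,  -4,  5,  -6,  7,  -8 },
        {  9, -10, 11, -12, 13, -14, 15, -16 },
        { -1,   2, -3,   4, -5,   6, -7,   8 },
        { 17, -18, 19, -20, 21, -22, 23, -24 }
    };

    /* scalar reference: sum of |x| over all 32 coefficients */
    int ref = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 8; j++ )
            ref += abs( ((short *)&v[i])[j] );

    /* vector result: store the four 32-bit lanes and add them up */
    vec_s32_t sum = add_abs_4( v[0], v[1], v[2], v[3] );
    int lanes[4] __attribute__((aligned(16)));
    vec_st( sum, 0, lanes );
    int total = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    printf( "vector %d scalar %d\n", total, ref );
    return total != ref;
}

Compared with the four chained VEC_ADD_ABS calls the patch removes, where each vec_sum4s depends on the previous satdv, this pairing halves the number of vec_sum4s operations and shortens the serial dependency chain, which is presumably where the 4x8 gain comes from.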