[x264-devel] ppc: Use xxpermdi in VEC_STORE8

Tue Mar 12 19:31:56 CET 2019

x264 | branch: master | Luca Barbato <lu_zero at gentoo.org> | Sun Aug 19 17:27:55 2018 +0200| [0d111333bbd65b1a76b5c646abf802f45dd41e96] | committer: Anton Mitrofanov

ppc: Use xxpermdi in VEC_STORE8

About a 2% speedup in overall encoding for --slow.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0d111333bbd65b1a76b5c646abf802f45dd41e96
---

 common/ppc/mc.c        |  3 ---
 common/ppc/ppccommon.h | 13 ++-----------
 common/ppc/predict.c   |  3 ---
 3 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 2faddfd9..3ceb1ac8 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -51,7 +51,6 @@ static inline void pixel_avg2_w8_altivec( uint8_t *dst,  intptr_t i_dst,
                                           uint8_t *src2, int i_height )
 {
     vec_u8_t src1v, src2v;
-    PREP_STORE8;
 
     for( int y = 0; y < i_height; y++ )
     {
@@ -525,7 +524,6 @@ static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
     srcp = &src[i_src_stride];
 
     LOAD_ZERO;
-    PREP_STORE8;
     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
     vec_u8_t    dstuv, dstvv;
@@ -1098,7 +1096,6 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
                                   const x264_weight_t *weight, int i_height )
 {
     LOAD_ZERO;
-    PREP_STORE8;
     vec_u8_t srcv;
     vec_s16_t weightv;
     vec_s16_t scalev, offsetv, denomv, roundv;
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index fd9d6a7d..51936e0a 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -146,19 +146,10 @@ typedef union {
 #define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
 
 /***********************************************************************
- * PREP_STORE##n: declares required vectors to store n bytes to a
- *                potentially unaligned address
  * VEC_STORE##n:  stores n bytes from vector v to address p
  **********************************************************************/
-#define PREP_STORE8                                                    \
-    vec_u8_t _tmp3v;                                                   \
-    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  \
-                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F } \
-
-#define VEC_STORE8( v, p )           \
-    _tmp3v = vec_vsx_ld( 0, p );     \
-    v = vec_perm( v, _tmp3v, mask ); \
-    vec_vsx_st( v, 0, p )
+#define VEC_STORE8( v, p ) \
+    vec_vsx_st( vec_xxpermdi( v, vec_vsx_ld( 0, p ), 1 ), 0, p )
 
 /***********************************************************************
  * VEC_TRANSPOSE_8
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 324b4c75..e3d84a49 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -58,8 +58,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
     vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
     vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
 
-    PREP_STORE8;
-
     for( int i = 0; i < 8; ++i )
     {
         vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
@@ -67,7 +65,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
         VEC_STORE8(com_sat_v, &src[0]);
         src += FDEC_STRIDE;
         add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
-
     }
 }