[x264-devel] [PATCH 3/3] ppc: Use xxpermdi in VEC_STORE8

Sun Aug 19 17:27:55 CEST 2018

Roughly a 2% speedup in overall encoding with --slow.
---
 common/ppc/mc.c        |  3 ---
 common/ppc/ppccommon.h | 12 ++----------
 common/ppc/predict.c   |  2 --
 3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 2faddfd9..3ceb1ac8 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -51,7 +51,6 @@ static inline void pixel_avg2_w8_altivec( uint8_t *dst,  intptr_t i_dst,
                                           uint8_t *src2, int i_height )
 {
     vec_u8_t src1v, src2v;
-    PREP_STORE8;
 
     for( int y = 0; y < i_height; y++ )
     {
@@ -525,7 +524,6 @@ static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
     srcp = &src[i_src_stride];
 
     LOAD_ZERO;
-    PREP_STORE8;
     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
     vec_u8_t    dstuv, dstvv;
@@ -1098,7 +1096,6 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
                                   const x264_weight_t *weight, int i_height )
 {
     LOAD_ZERO;
-    PREP_STORE8;
     vec_u8_t srcv;
     vec_s16_t weightv;
     vec_s16_t scalev, offsetv, denomv, roundv;
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index fd9d6a7d..311e12a2 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -146,18 +146,10 @@ typedef union {
 #define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
 
 /***********************************************************************
- * PREP_STORE##n: declares required vectors to store n bytes to a
- *                potentially unaligned address
  * VEC_STORE##n:  stores n bytes from vector v to address p
  **********************************************************************/
-#define PREP_STORE8                                                    \
-    vec_u8_t _tmp3v;                                                   \
-    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  \
-                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F } \
-
-#define VEC_STORE8( v, p )           \
-    _tmp3v = vec_vsx_ld( 0, p );     \
-    v = vec_perm( v, _tmp3v, mask ); \
+#define VEC_STORE8( v, p )                        \
+    v = vec_xxpermdi( v, vec_vsx_ld( 0, p ), 1 ); \
     vec_vsx_st( v, 0, p )
 
 /***********************************************************************
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 324b4c75..0b6bae42 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -58,8 +58,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
     vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
     vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
 
-    PREP_STORE8;
-
     for( int i = 0; i < 8; ++i )
     {
         vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
-- 
2.12.2