[x264-devel] ppc: Use xxpermdi in VEC_STORE8
Luca Barbato
git at videolan.org
Tue Mar 12 19:31:56 CET 2019
x264 | branch: master | Luca Barbato <lu_zero at gentoo.org> | Sun Aug 19 17:27:55 2018 +0200| [0d111333bbd65b1a76b5c646abf802f45dd41e96] | committer: Anton Mitrofanov
ppc: Use xxpermdi in VEC_STORE8
Roughly a 2% speedup in overall encoding for --slow.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0d111333bbd65b1a76b5c646abf802f45dd41e96
---
common/ppc/mc.c | 3 ---
common/ppc/ppccommon.h | 13 ++-----------
common/ppc/predict.c | 3 ---
3 files changed, 2 insertions(+), 17 deletions(-)
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 2faddfd9..3ceb1ac8 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -51,7 +51,6 @@ static inline void pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_STORE8;
for( int y = 0; y < i_height; y++ )
{
@@ -525,7 +524,6 @@ static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
srcp = &src[i_src_stride];
LOAD_ZERO;
- PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
vec_u8_t dstuv, dstvv;
@@ -1098,7 +1096,6 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_STORE8;
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index fd9d6a7d..51936e0a 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -146,19 +146,10 @@ typedef union {
#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
/***********************************************************************
- * PREP_STORE##n: declares required vectors to store n bytes to a
- * potentially unaligned address
* VEC_STORE##n: stores n bytes from vector v to address p
**********************************************************************/
-#define PREP_STORE8 \
- vec_u8_t _tmp3v; \
- vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F } \
-
-#define VEC_STORE8( v, p ) \
- _tmp3v = vec_vsx_ld( 0, p ); \
- v = vec_perm( v, _tmp3v, mask ); \
- vec_vsx_st( v, 0, p )
+#define VEC_STORE8( v, p ) \
+ vec_vsx_st( vec_xxpermdi( v, vec_vsx_ld( 0, p ), 1 ), 0, p )
/***********************************************************************
* VEC_TRANSPOSE_8
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 324b4c75..e3d84a49 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -58,8 +58,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
- PREP_STORE8;
-
for( int i = 0; i < 8; ++i )
{
vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
@@ -67,7 +65,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
VEC_STORE8(com_sat_v, &src[0]);
src += FDEC_STRIDE;
add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
-
}
}
More information about the x264-devel
mailing list