[x264-devel] [PATCH 5/6] ppc: Use vec_vsx_ld instead of VEC_LOAD/STORE macros, remove
Luca Barbato
lu_zero at gentoo.org
Tue Nov 1 23:16:17 CET 2016
From: Alexandra Hajkova <alexandra.khirnova at gmail.com>
VEC_LOAD*, some of VEC_STORE* macros, some PREP* macros and VEC_DIFF_H_OFFSET macro
Addresses issue #2: make sure the functions do not use deprecated
primitives.
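
For reviewers unfamiliar with the old macros, here is a minimal standalone sketch (not part of the patch) contrasting the vec_lvsl()/vec_perm() unaligned-load idiom that VEC_LOAD expanded to — the deprecated primitive this series removes — with the single vec_vsx_ld() call used instead. The helper names and the local vec_u8_t typedef (mirroring ppccommon.h) are only illustrative, and a VSX-capable compiler (-maltivec -mvsx) is assumed:

#include <altivec.h>
#include <stdint.h>

typedef vector unsigned char vec_u8_t; /* as in ppccommon.h */

/* Old idiom (what VEC_LOAD expanded to): two aligned loads plus a
 * permute whose control vector comes from vec_lvsl(), which GCC
 * deprecates on little-endian targets. */
static vec_u8_t load16_unaligned_old( const uint8_t *p )
{
    vec_u8_t hv   = vec_ld( 0,  p );   /* aligned load covering the start */
    vec_u8_t lv   = vec_ld( 15, p );   /* aligned load covering the end   */
    vec_u8_t perm = vec_lvsl( 0, p );  /* shift pattern from the pointer's alignment */
    return vec_perm( hv, lv, perm );
}

/* New idiom: one VSX load handles the misalignment directly. */
static vec_u8_t load16_unaligned_new( const uint8_t *p )
{
    return vec_vsx_ld( 0, p );
}
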
---
common/ppc/dct.c | 61 +++---
common/ppc/deblock.c | 35 ++--
common/ppc/mc.c | 174 ++++++++--------
common/ppc/pixel.c | 524 ++++++++++++-------------------------------------
common/ppc/ppccommon.h | 112 ++---------
5 files changed, 255 insertions(+), 651 deletions(-)
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index 901659e..6dc0447 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -258,11 +258,10 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
vec_ste(va_u32, element, (uint32_t*)dst);
-#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv) \
+#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv) \
{ \
/* unaligned load */ \
- vec_u8_t lv = vec_ld(0, dest); \
- vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
+ vec_u8_t dstv = vec_vsx_ld(0, dest); \
vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \
vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
@@ -296,14 +295,13 @@ void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
vec_s16_t idct0, idct1, idct2, idct3;
IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );
- vec_u8_t perm_ldv = vec_lvsl( 0, dst );
vec_u16_t sixv = vec_splat_u16(6);
LOAD_ZERO;
- ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
- ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
- ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
- ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
+ ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0 );
+ ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1 );
+ ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2 );
+ ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3 );
}
void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] )
@@ -377,25 +375,15 @@ void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] )
d7 = vec_sub(b0v, b7v); \
}
-#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel)\
-{\
- /* unaligned load */ \
- vec_u8_t hv = vec_ld( 0, dest ); \
- vec_u8_t lv = vec_ld( 7, dest ); \
- vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
- vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
- vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \
- vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
- vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
- /* unaligned store */ \
- vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
- vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
- lv = vec_sel( lv, bodyv, edgelv ); \
- vec_st( lv, 7, dest ); \
- hv = vec_ld( 0, dest ); \
- vec_u8_t edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
- hv = vec_sel( hv, bodyv, edgehv ); \
- vec_st( hv, 0, dest ); \
+#define ALTIVEC_STORE_SUM_CLIP(dest, idctv) \
+{ \
+ vec_s16_t idct_sh6 = vec_sra( idctv, sixv ); \
+ /* unaligned load */ \
+ vec_u8_t dstv = vec_vsx_ld( 0, dest ); \
+ vec_s16_t idstsum = vec_adds( idct_sh6, vec_u8_to_s16_h(dstv) ); \
+ vec_u8_t idstsum8 = vec_packsu( idstsum, vec_u8_to_s16_l(dstv) ); \
+ /* unaligned store */ \
+ vec_vsx_st( idstsum8, 0, dest ); \
}
void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
@@ -428,20 +416,17 @@ void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
IDCT8_1D_ALTIVEC(tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7,
idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
- vec_u8_t perm_ldv = vec_lvsl(0, dst);
- vec_u8_t perm_stv = vec_lvsr(8, dst);
vec_u16_t sixv = vec_splat_u16(6);
- const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
LOAD_ZERO;
- ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
- ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
+ ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0);
+ ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1);
+ ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2);
+ ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3);
+ ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4);
+ ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5);
+ ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6);
+ ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7);
}
void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] )
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index bf5c833..f18db0b 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -80,23 +80,22 @@ static inline void write16x4( uint8_t *dst, int dst_stride,
#define read_and_transpose16x6(src, src_stride, r8, r9, r10, r11, r12, r13)\
{\
register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
- VEC_LOAD(src, r0, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + src_stride, r1, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 2*src_stride, r2, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 3*src_stride, r3, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 4*src_stride, r4, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 5*src_stride, r5, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 6*src_stride, r6, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 7*src_stride, r7, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix ); \
- \
- VEC_LOAD(src + 8*src_stride, r8, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 9*src_stride, r9, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix ); \
- VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix ); \
+ r0 = vec_vsx_ld(0, src); \
+ r1 = vec_vsx_ld(src_stride, src); \
+ r2 = vec_vsx_ld(2*src_stride, src); \
+ r3 = vec_vsx_ld(3*src_stride, src); \
+ r4 = vec_vsx_ld(4*src_stride, src); \
+ r5 = vec_vsx_ld(5*src_stride, src); \
+ r6 = vec_vsx_ld(6*src_stride, src); \
+ r7 = vec_vsx_ld(7*src_stride, src); \
+ r8 = vec_vsx_ld(8*src_stride, src); \
+ r9 = vec_vsx_ld(9*src_stride, src); \
+ r10 = vec_vsx_ld(10*src_stride, src); \
+ r11 = vec_vsx_ld(11*src_stride, src); \
+ r12 = vec_vsx_ld(12*src_stride, src); \
+ r13 = vec_vsx_ld(13*src_stride, src); \
+ r14 = vec_vsx_ld(14*src_stride, src); \
+ r15 = vec_vsx_ld(15*src_stride, src); \
\
/*Merge first pairs*/ \
r0 = vec_mergeh(r0, r8); /*0, 8*/ \
@@ -291,8 +290,6 @@ void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int
register vec_u8_t line0, line1, line2, line3, line4, line5;
if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0 )
return;
- PREP_LOAD;
- vec_u8_t _pix_ = vec_lvsl(0, pix-3);
read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
transpose4x16(line1, line2, line3, line4);
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index e169166..e696bea 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -64,17 +64,15 @@ static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_LOAD;
PREP_STORE8;
- PREP_LOAD_SRC( src1 );
- PREP_LOAD_SRC( src2 );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
- VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
+ src1v = vec_vsx_ld( 0, src1 );
+ src2v = vec_vsx_ld( 0, src2 );
src1v = vec_avg( src1v, src2v );
- VEC_STORE8( src1v, dst );
+
+ VEC_STORE8(src1v, dst);
dst += i_dst;
src1 += i_src1;
@@ -87,14 +85,11 @@ static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_LOAD;
- PREP_LOAD_SRC( src1 );
- PREP_LOAD_SRC( src2 );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
- VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
+ src1v = vec_vsx_ld( 0, src1 );
+ src2v = vec_vsx_ld( 0, src2 );
src1v = vec_avg( src1v, src2v );
vec_st(src1v, 0, dst);
@@ -133,12 +128,10 @@ static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src, intptr_t i_src, int i_height )
{
vec_u8_t cpyV;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
+ cpyV = vec_vsx_ld( 0, src );
vec_st(cpyV, 0, dst);
src += i_src;
@@ -307,8 +300,6 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
srcp = &src[i_src_stride];
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src2v_8, dstuv, dstvv;
vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
@@ -330,7 +321,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, src );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
@@ -338,7 +329,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
{
src0v_16 = src2v_16;
src1v_16 = src3v_16;
- VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
@@ -360,7 +351,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
src0v_16 = src2v_16;
src1v_16 = src3v_16;
- VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
@@ -400,8 +391,6 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
srcp = &src[i_src_stride];
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
@@ -426,16 +415,16 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
#endif
- VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, src );
+ src3v_8 = vec_vsx_ld( 16, src );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
for( int y = 0; y < i_height; y += 2 )
{
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
+ src3v_8 = vec_vsx_ld( 16, srcp );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
@@ -472,8 +461,8 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
+ src3v_8 = vec_vsx_ld( 16, srcp );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
@@ -555,8 +544,8 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
#define HPEL_FILTER_HORIZONTAL() \
{ \
- VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
- VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
+ src1v = vec_vsx_ld( x- 2+i_stride*y, src ); \
+ src6v = vec_vsx_ld( x+14+i_stride*y, src ); \
\
src2v = VSLD( src1v, src6v, 1 ); \
src3v = VSLD( src1v, src6v, 2 ); \
@@ -592,17 +581,17 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x+i_stride*y, dsth ); \
}
#define HPEL_FILTER_VERTICAL() \
{ \
- VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
+ src1v = vec_vsx_ld( x+i_stride*(y-2), src ); \
+ src2v = vec_vsx_ld( x+i_stride*(y-1), src ); \
+ src3v = vec_vsx_ld( x+i_stride*(y-0), src ); \
+ src4v = vec_vsx_ld( x+i_stride*(y+1), src ); \
+ src5v = vec_vsx_ld( x+i_stride*(y+2), src ); \
+ src6v = vec_vsx_ld( x+i_stride*(y+3), src ); \
\
temp1v = vec_u8_to_s16_h( src1v ); \
temp2v = vec_u8_to_s16_h( src2v ); \
@@ -632,7 +621,7 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x+i_stride*y, dstv ); \
}
#define HPEL_FILTER_CENTRAL() \
@@ -665,7 +654,7 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x-16+i_stride*y, dstc ); \
}
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
@@ -677,10 +666,6 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint
vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
- PREP_LOAD;
- PREP_LOAD_SRC( src);
- PREP_STORE16;
- PREP_STORE16_DST( dsth );
LOAD_ZERO;
vec_u16_t twov, fourv, fivev, sixv;
@@ -736,12 +721,12 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint
}
/* Partial vertical filter */
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
+ src1v = vec_vsx_ld( x+i_stride*(y-2), src );
+ src2v = vec_vsx_ld( x+i_stride*(y-1), src );
+ src3v = vec_vsx_ld( x+i_stride*(y-0), src );
+ src4v = vec_vsx_ld( x+i_stride*(y+1), src );
+ src5v = vec_vsx_ld( x+i_stride*(y+2), src );
+ src6v = vec_vsx_ld( x+i_stride*(y+3), src );
temp1v = vec_u8_to_s16_h( src1v );
temp2v = vec_u8_to_s16_h( src2v );
@@ -875,8 +860,6 @@ static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
@@ -900,7 +883,7 @@ static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
@@ -915,7 +898,7 @@ static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
@@ -929,8 +912,6 @@ static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
@@ -954,7 +935,7 @@ static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
@@ -969,7 +950,7 @@ static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
@@ -983,8 +964,6 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u8_t srcv;
vec_s16_t weightv;
@@ -1009,7 +988,7 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
@@ -1024,7 +1003,7 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
@@ -1038,8 +1017,6 @@ static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, i
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weight_lv, weight_hv;
vec_s16_t scalev, offsetv, denomv, roundv;
@@ -1063,7 +1040,7 @@ static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, i
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weight_hv = vec_u8_to_s16_h( srcv );
weight_lv = vec_u8_to_s16_l( srcv );
@@ -1082,7 +1059,7 @@ static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, i
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weight_hv = vec_u8_to_s16_h( srcv );
weight_lv = vec_u8_to_s16_l( srcv );
@@ -1098,8 +1075,7 @@ static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, i
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD_SRC( src );
- vec_u8_t src_1v, src_2v, src_3v;
+ vec_u8_t srcv, srcv2;
vec_s16_t weight_lv, weight_hv, weight_3v;
vec_s16_t scalev, offsetv, denomv, roundv;
vec_s16_u loadv;
@@ -1114,60 +1090,68 @@ static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, i
if( denom >= 1 )
{
+ int16_t round = 1<<(denom - 1);
+ vec_s16_t tab[4] = {
+ { weight->i_scale, weight->i_scale, weight->i_scale, weight->i_scale, 1, 1, 1, 1 },
+ { weight->i_offset, weight->i_offset, weight->i_offset, weight->i_offset, 0, 0, 0, 0 },
+ { denom, denom, denom, denom, 0, 0, 0, 0 },
+ { round, round, round, round, 0, 0, 0, 0 },
+ };
+
loadv.s[0] = denom;
denomv = vec_splat( loadv.v, 0 );
- loadv.s[0] = 1<<(denom - 1);
+ loadv.s[0] = round;
roundv = vec_splat( loadv.v, 0 );
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- src_1v = vec_ld( 0, src );
- src_2v = vec_ld( 16, src );
- src_3v = vec_ld( 19, src );
- src_1v = vec_perm( src_1v, src_2v, _src_ );
- src_3v = vec_perm( src_2v, src_3v, _src_ );
- weight_hv = vec_u8_to_s16_h( src_1v );
- weight_lv = vec_u8_to_s16_l( src_1v );
- weight_3v = vec_u8_to_s16_h( src_3v );
+ srcv = vec_vsx_ld( 0, src );
+ srcv2 = vec_vsx_ld( 16, src );
+
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+ weight_3v = vec_u8_to_s16_h( srcv2 );
weight_hv = vec_mladd( weight_hv, scalev, roundv );
weight_lv = vec_mladd( weight_lv, scalev, roundv );
- weight_3v = vec_mladd( weight_3v, scalev, roundv );
+ weight_3v = vec_mladd( weight_3v, tab[0], tab[3] );
+
weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
- weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
+ weight_3v = vec_sra( weight_3v, (vec_u16_t)tab[2] );
+
weight_hv = vec_add( weight_hv, offsetv );
weight_lv = vec_add( weight_lv, offsetv );
- weight_3v = vec_add( weight_3v, offsetv );
+ weight_3v = vec_add( weight_3v, tab[1] );
- src_1v = vec_packsu( weight_hv, weight_lv );
- src_3v = vec_packsu( weight_3v, zero_s16v );
- vec_st( src_1v, 0, dst );
- vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
+ srcv = vec_packsu( weight_hv, weight_lv );
+ srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ));
+ vec_vsx_st( srcv, 0, dst );
+ vec_vsx_st( srcv2, 16, dst );
}
}
else
{
+ vec_s16_t offset_mask = { weight->i_offset, weight->i_offset, weight->i_offset,
+ weight->i_offset, 0, 0, 0, 0 };
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- src_1v = vec_ld( 0, src );
- src_2v = vec_ld( 16, src );
- src_3v = vec_ld( 19, src );
- src_1v = vec_perm( src_1v, src_2v, _src_ );
- src_3v = vec_perm( src_2v, src_3v, _src_ );
- weight_hv = vec_u8_to_s16_h( src_1v );
- weight_lv = vec_u8_to_s16_l( src_1v );
- weight_3v = vec_u8_to_s16_h( src_3v );
+ srcv = vec_vsx_ld( 0, src );
+ srcv2 = vec_vsx_ld( 16, src );
+
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+ weight_3v = vec_u8_to_s16_h( srcv2 );
weight_hv = vec_mladd( weight_hv, scalev, offsetv );
weight_lv = vec_mladd( weight_lv, scalev, offsetv );
- weight_3v = vec_mladd( weight_3v, scalev, offsetv );
+ weight_3v = vec_mladd( weight_3v, scalev, offset_mask );
- src_1v = vec_packsu( weight_hv, weight_lv );
- src_3v = vec_packsu( weight_3v, zero_s16v );
- vec_st( src_1v, 0, dst );
- vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
+ srcv = vec_packsu( weight_hv, weight_lv );
+ srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ));
+ vec_vsx_st(srcv, 0, dst);
+ vec_vsx_st(srcv2, 16, dst);
}
}
}
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 5ace725..a709b0f 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -39,13 +39,12 @@ static int name( uint8_t *pix1, intptr_t i_pix1, \
ALIGNED_16( int sum ); \
\
LOAD_ZERO; \
- PREP_LOAD; \
vec_u8_t pix1v, pix2v; \
vec_s32_t sumv = zero_s32v; \
for( int y = 0; y < ly; y++ ) \
{ \
- VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t ); \
- VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t ); \
+ pix1v = vec_vsx_ld( 0, pix1 ); \
+ pix2v = vec_vsx_ld( 0, pix2 ); \
sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
@@ -124,19 +123,14 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
ALIGNED_16( int i_satd );
PREP_DIFF;
- PREP_LOAD_SRC( pix1 );
vec_s16_t diff0v, diff1v, diff2v, diff3v;
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
/* Hadamar H */
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
@@ -173,14 +167,10 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -192,10 +182,10 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
VEC_ADD_ABS( temp2v, satdv, satdv );
VEC_ADD_ABS( temp3v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -229,15 +219,10 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
-
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
@@ -283,19 +268,14 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- vec_u8_t _offset1_1v_ = vec_lvsl(0, pix1);
- vec_u8_t _offset1_2v_ = vec_lvsl(0, pix1 + i_pix1);
- vec_u8_t _offset2_1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2_2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset1_2v, offset2_2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
@@ -343,18 +323,14 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v , offset1v);
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -376,14 +352,14 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
VEC_ADD_ABS( temp6v, satdv, satdv );
VEC_ADD_ABS( temp7v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -421,8 +397,6 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
ALIGNED_16( int i_satd );
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( pix2 );
vec_s32_t satdv;
vec_s16_t pix1v, pix2v;
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
@@ -505,7 +479,6 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
ALIGNED_16( int i_satd );
LOAD_ZERO;
- PREP_LOAD;
vec_s32_t satdv;
vec_s16_t pix1v, pix2v;
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
@@ -514,8 +487,6 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
diffl4v, diffl5v, diffl6v, diffl7v;
vec_s16_t temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
- PREP_LOAD_SRC( pix2 );
-
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
@@ -639,11 +610,7 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- //vec_u8_t perm0v, perm1v, perm2v, perm3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
@@ -651,39 +618,21 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -691,27 +640,19 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -750,65 +691,42 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv; // temporary load vectors
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -842,10 +760,7 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
@@ -853,39 +768,21 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
- fencv = vec_ld(0, fenc);
+ fencv = vec_ld( 0, fenc );
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -893,27 +790,19 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -952,64 +841,41 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1046,10 +912,7 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
@@ -1057,41 +920,21 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1099,28 +942,19 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1159,67 +993,41 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB,permEncv;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1255,10 +1063,7 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
@@ -1266,41 +1071,21 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1308,28 +1093,19 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1368,67 +1144,41 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
@@ -1466,17 +1216,10 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
vec_u32_t sumv;
vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
- vec_u8_t temp_lv, temp_hv;
- vec_u8_t permA, permB;
sumv = vec_splat_u32(0);
- permA = vec_lvsl(0, pix2);
- permB = vec_lvsl(0, pix2 + i_stride_pix2);
-
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vA = vec_perm(temp_lv, temp_hv, permA);
+ pix2vA = vec_vsx_ld(0, pix2);
pix1vA = vec_ld(0, pix1);
for( int y = 0; y < 7; y++ )
@@ -1487,9 +1230,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
maxA = vec_max(pix1vA, pix2vA);
minA = vec_min(pix1vA, pix2vA);
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vB = vec_perm(temp_lv, temp_hv, permB);
+ pix2vB = vec_vsx_ld(0, pix2);
pix1vB = vec_ld(0, pix1);
diffA = vec_sub(maxA, minA);
@@ -1501,9 +1242,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
maxB = vec_max(pix1vB, pix2vB);
minB = vec_min(pix1vB, pix2vB);
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vA = vec_perm(temp_lv, temp_hv, permA);
+ pix2vA = vec_vsx_ld(0, pix2);
pix1vA = vec_ld(0, pix1);
diffB = vec_sub(maxB, minB);
@@ -1513,9 +1252,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
pix1 += i_stride_pix1;
pix2 += i_stride_pix2;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vB = vec_perm(temp_lv, temp_hv, permB);
+ pix2vB = vec_vsx_ld(0, pix2);
pix1vB = vec_ld(0, pix1);
maxA = vec_max(pix1vA, pix2vA);
@@ -1545,25 +1282,15 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
vec_u8_t pix1v, pix2v;
vec_u32_t sumv;
vec_u8_t maxv, minv, diffv;
- vec_u8_t temp_lv, temp_hv;
- vec_u8_t perm1v, perm2v;
const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
sumv = vec_splat_u32(0);
- perm1v = vec_lvsl(0, pix1);
- perm2v = vec_lvsl(0, pix2);
-
for( int y = 0; y < 8; y++ )
{
- temp_hv = vec_ld(0, pix1);
- temp_lv = vec_ld(7, pix1);
- pix1v = vec_perm(temp_hv, temp_lv, perm1v);
-
- temp_hv = vec_ld(0, pix2);
- temp_lv = vec_ld(7, pix2);
- pix2v = vec_perm(temp_hv, temp_lv, perm2v);
+ pix1v = vec_vsx_ld( 0, pix1);
+ pix2v = vec_vsx_ld( 0, pix2);
maxv = vec_max(pix1v, pix2v);
minv = vec_min(pix1v, pix2v);
@@ -1719,20 +1446,18 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
int32_t i_satd=0;
PREP_DIFF;
- PREP_LOAD_SRC( pix1 );
- PREP_LOAD_SRC( pix2 );
vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
@@ -1952,17 +1677,14 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
vec_u8_t pix1v, pix2v;
vec_u32_t s1v, s2v, ssv, s12v;
- PREP_LOAD;
- PREP_LOAD_SRC (pix1);
- PREP_LOAD_SRC (pix2);
LOAD_ZERO;
s1v = s2v = ssv = s12v = zero_u32v;
for( int y = 0; y < 4; y++ )
{
- VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
- VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
+ pix1v = vec_vsx_ld( y*stride1, pix1 );
+ pix2v = vec_vsx_ld( y*stride2, pix2 );
s1v = vec_sum4s( pix1v, s1v );
s2v = vec_sum4s( pix2v, s2v );
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index bd5fd25..c656afe 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -121,89 +121,20 @@ typedef union {
#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
-
-/***********************************************************************
- * PREP_LOAD: declares two vectors required to perform unaligned loads
- * VEC_LOAD: loads n bytes from u8 * p into vector v of type t where o is from original src offset
- * VEC_LOAD:_G: loads n bytes from u8 * p into vectory v of type t - use when offset is not known
- * VEC_LOAD_OFFSET: as above, but with offset vector known in advance
- **********************************************************************/
-#define PREP_LOAD \
- vec_u8_t _hv, _lv
-
-#define PREP_LOAD_SRC( src ) \
- vec_u8_t _##src##_ = vec_lvsl(0, src)
-
-#define VEC_LOAD_G( p, v, n, t ) \
- _hv = vec_ld( 0, p ); \
- v = (t) vec_lvsl( 0, p ); \
- _lv = vec_ld( n - 1, p ); \
- v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
-
-#define VEC_LOAD( p, v, n, t, g ) \
- _hv = vec_ld( 0, p ); \
- _lv = vec_ld( n - 1, p ); \
- v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )
-
-#define VEC_LOAD_OFFSET( p, v, n, t, o ) \
- _hv = vec_ld( 0, p); \
- _lv = vec_ld( n - 1, p ); \
- v = (t) vec_perm( _hv, _lv, (vec_u8_t) o )
-
-#define VEC_LOAD_PARTIAL( p, v, n, t, g) \
- _hv = vec_ld( 0, p); \
- v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
-
-
/***********************************************************************
* PREP_STORE##n: declares required vectors to store n bytes to a
* potentially unaligned address
* VEC_STORE##n: stores n bytes from vector v to address p
**********************************************************************/
-#define PREP_STORE16 \
- vec_u8_t _tmp1v \
-
-#define PREP_STORE16_DST( dst ) \
- vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
- vec_u8_t _##dst##r_ = vec_lvsr(0, dst);
-
-#define VEC_STORE16( v, p, o ) \
- _hv = vec_ld( 0, p ); \
- _lv = vec_ld( 15, p ); \
- _tmp1v = vec_perm( _lv, _hv, _##o##l_ ); \
- _lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
- vec_st( _lv, 15, (uint8_t *) p ); \
- _hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
- vec_st( _hv, 0, (uint8_t *) p )
-
+#define PREP_STORE8 \
+ vec_u8_t _tmp3v; \
+ vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F } \
-#define PREP_STORE8 \
- vec_u8_t _tmp3v \
-
-#define VEC_STORE8( v, p ) \
- _tmp3v = vec_lvsl(0, p); \
- v = vec_perm(v, v, _tmp3v); \
- vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
- vec_ste((vec_u32_t)v,4,(uint32_t*)p)
-
-
-#define PREP_STORE4 \
- PREP_STORE16; \
- vec_u8_t _tmp2v, _tmp3v; \
- const vec_u8_t sel = \
- (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)
-
-#define VEC_STORE4( v, p ) \
- _tmp3v = vec_lvsr( 0, p ); \
- v = vec_perm( v, v, _tmp3v ); \
- _lv = vec_ld( 3, p ); \
- _tmp1v = vec_perm( sel, zero_u8v, _tmp3v ); \
- _lv = vec_sel( _lv, v, _tmp1v ); \
- vec_st( _lv, 3, p ); \
- _hv = vec_ld( 0, p ); \
- _tmp2v = vec_perm( zero_u8v, sel, _tmp3v ); \
- _hv = vec_sel( _hv, v, _tmp2v ); \
- vec_st( _hv, 0, p )
+#define VEC_STORE8( v, p ) \
+ _tmp3v = vec_vsx_ld( 0, p ); \
+ v = vec_perm( v, _tmp3v, mask ); \
+ vec_vsx_st(v, 0, p)
/***********************************************************************
* VEC_TRANSPOSE_8
@@ -270,29 +201,17 @@ typedef union {
**********************************************************************/
#define PREP_DIFF \
LOAD_ZERO; \
- PREP_LOAD; \
vec_s16_t pix1v, pix2v;
-
-#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g) \
- VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
- pix1v = vec_u8_to_s16( pix1v ); \
- VEC_LOAD( p2, pix2v, n, vec_s16_t, g); \
- pix2v = vec_u8_to_s16( pix2v ); \
- d = vec_sub( pix1v, pix2v ); \
- p1 += i1; \
- p2 += i2
-
-#define VEC_DIFF_H_OFFSET(p1,i1,p2,i2,n,d,g1,g2) \
- pix1v = (vec_s16_t)vec_perm( vec_ld( 0, p1 ), zero_u8v, _##g1##_ );\
+#define VEC_DIFF_H(p1,i1,p2,i2,n,d) \
+ pix1v = vec_vsx_ld( 0, (int16_t *)p1 ); \
pix1v = vec_u8_to_s16( pix1v ); \
- VEC_LOAD( p2, pix2v, n, vec_s16_t, g2); \
+ pix2v = vec_vsx_ld( 0, (int16_t *)p2 ); \
pix2v = vec_u8_to_s16( pix2v ); \
d = vec_sub( pix1v, pix2v ); \
p1 += i1; \
p2 += i2
-
/***********************************************************************
* VEC_DIFF_HL
***********************************************************************
@@ -308,7 +227,7 @@ typedef union {
pix1v = (vec_s16_t)vec_ld(0, p1); \
temp0v = vec_u8_to_s16_h( pix1v ); \
temp1v = vec_u8_to_s16_l( pix1v ); \
- VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
+ pix2v = vec_vsx_ld( 0, (int16_t *)p2 ); \
temp2v = vec_u8_to_s16_h( pix2v ); \
temp3v = vec_u8_to_s16_l( pix2v ); \
dh = vec_sub( temp0v, temp2v ); \
@@ -332,13 +251,10 @@ typedef union {
LOAD_ZERO; \
vec_s16_t pix1v, pix2v; \
vec_u8_t pix1v8, pix2v8; \
-vec_u8_t permPix1, permPix2; \
-permPix1 = vec_lvsl(0, pix1); \
-permPix2 = vec_lvsl(0, pix2); \
#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \
-pix1v8 = vec_perm(vec_ld(0,p1), zero_u8v, permPix1); \
-pix2v8 = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
+pix1v8 = vec_vsx_ld( 0,p1 ); \
+pix2v8 = vec_vsx_ld( 0, p2 ); \
pix1v = vec_u8_to_s16( pix1v8 ); \
pix2v = vec_u8_to_s16( pix2v8 ); \
d = vec_sub( pix1v, pix2v); \
--
2.9.2
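
As a side note for reviewers, the following standalone sketch shows what the reworked VEC_STORE8 macro in ppccommon.h does: it merges the low 8 bytes of the source vector with the upper 8 bytes already present at the destination and writes the result back with a single vec_vsx_st(). The wrapper name store8_unaligned and the local typedef are only illustrative; a VSX-capable compiler is assumed.

#include <altivec.h>
#include <stdint.h>

typedef vector unsigned char vec_u8_t; /* as in ppccommon.h */

/* Equivalent of the new VEC_STORE8: store the low 8 bytes of v at an
 * arbitrarily aligned address while keeping the following 8 bytes of
 * the destination intact. */
static void store8_unaligned( vec_u8_t v, uint8_t *p )
{
    /* indices 0x00-0x07 pick bytes 0-7 of v; indices 0x18-0x1F pick
     * bytes 8-15 of the destination's current contents */
    const vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
    vec_u8_t old = vec_vsx_ld( 0, p );        /* read current destination  */
    vec_u8_t out = vec_perm( v, old, mask );  /* low half from v, high kept */
    vec_vsx_st( out, 0, p );                  /* single unaligned store    */
}

Unlike the old vec_ste()-based VEC_STORE8, this is a 16-byte read-modify-write of the destination, so the 8 bytes following the store target must remain valid to touch.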