[x264-devel] [PATCH 2/2] ppc: Add quant_4x4x4
Luca Barbato
lu_zero at gentoo.org
Fri Aug 17 22:28:45 CEST 2018
4x faster than C.
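
For reference, the operation being vectorized is the plain 4x4 quantization
applied to four blocks in one call: bias the absolute coefficient, multiply by
mf, shift down by 16, restore the sign, and report per block whether anything
survived. The sketch below is illustrative only (the function name
quant_4x4x4_sketch is made up for this note, and the real C version lives in
common/quant.c); the AltiVec code additionally saturates the intermediates.

    #include <stdint.h>

    /* Rough scalar sketch of what quant_4x4x4 computes (illustrative only). */
    static int quant_4x4x4_sketch( int16_t dct[4][16],
                                   uint16_t mf[16], uint16_t bias[16] )
    {
        int nzmask = 0;
        for( int j = 0; j < 4; j++ )
        {
            int nz = 0;
            for( int i = 0; i < 16; i++ )
            {
                /* quantize the magnitude, then restore the sign */
                if( dct[j][i] > 0 )
                    dct[j][i] =  ((bias[i] + dct[j][i]) * mf[i]) >> 16;
                else
                    dct[j][i] = -(((bias[i] - dct[j][i]) * mf[i]) >> 16);
                nz |= dct[j][i];
            }
            nzmask |= (!!nz) << j;   /* one non-zero flag per 4x4 block */
        }
        return nzmask;
    }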
---
common/ppc/quant.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++
common/ppc/quant.h | 2 +
common/quant.c | 1 +
3 files changed, 204 insertions(+)
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index f69c54f1..45bd8388 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -90,6 +90,207 @@ int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16]
return vec_any_ne(nz, zero_s16v);
}
+int x264_quant_4x4x4_altivec( dctcoef dcta[4][16], udctcoef mf[16], udctcoef bias[16] )
+{
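+    // Quantize four 4x4 blocks in one call. Each block's 16 coefficients are
+    // processed as two vectors of 8 (the "A" and "B" halves), and the work for
+    // the four blocks is interleaved to keep the vector pipelines busy.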
+ LOAD_ZERO;
+ vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 );
+ vec_s16_t one = vec_splat_s16(1);
+ vec_s16_t nz0, nz1, nz2, nz3;
+
+ vector bool short mskA0;
+ vec_u16_t coefvA0;
+ vec_u32_t multEvenvA0, multOddvA0;
+ vec_u16_t mfvA0;
+ vec_u16_t biasvA0;
+ vector bool short mskB0;
+ vec_u16_t coefvB0;
+ vec_u32_t multEvenvB0, multOddvB0;
+ vec_u16_t mfvB0;
+ vec_u16_t biasvB0;
+
+ vector bool short mskA1;
+ vec_u16_t coefvA1;
+ vec_u32_t multEvenvA1, multOddvA1;
+ vec_u16_t mfvA1;
+ vec_u16_t biasvA1;
+ vector bool short mskB1;
+ vec_u16_t coefvB1;
+ vec_u32_t multEvenvB1, multOddvB1;
+ vec_u16_t mfvB1;
+ vec_u16_t biasvB1;
+
+ vector bool short mskA2;
+ vec_u16_t coefvA2;
+ vec_u32_t multEvenvA2, multOddvA2;
+ vec_u16_t mfvA2;
+ vec_u16_t biasvA2;
+ vector bool short mskB2;
+ vec_u16_t coefvB2;
+ vec_u32_t multEvenvB2, multOddvB2;
+ vec_u16_t mfvB2;
+ vec_u16_t biasvB2;
+
+ vector bool short mskA3;
+ vec_u16_t coefvA3;
+ vec_u32_t multEvenvA3, multOddvA3;
+ vec_u16_t mfvA3;
+ vec_u16_t biasvA3;
+ vector bool short mskB3;
+ vec_u16_t coefvB3;
+ vec_u32_t multEvenvB3, multOddvB3;
+ vec_u16_t mfvB3;
+ vec_u16_t biasvB3;
+
+ vec_s16_t temp1v, temp2v;
+ vec_s16_t tmpv0;
+ vec_s16_t tmpv1;
+
+ dctcoef *dct0 = dcta[0];
+ dctcoef *dct1 = dcta[1];
+ dctcoef *dct2 = dcta[2];
+ dctcoef *dct3 = dcta[3];
+
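+    // Load the coefficients of all four blocks, record their sign masks and
+    // take absolute values; mf and bias hold the same data for every block,
+    // so the same vectors are reloaded into per-block registers.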
+ temp1v = vec_ld(0, dct0);
+ temp2v = vec_ld(16, dct0);
+ mfvA0 = vec_ld(0, mf);
+ mfvB0 = vec_ld(16, mf);
+ biasvA0 = vec_ld(0, bias);
+ biasvB0 = vec_ld(16, bias);
+ mskA0 = vec_cmplt(temp1v, zero_s16v);
+ mskB0 = vec_cmplt(temp2v, zero_s16v);
+ coefvA0 = (vec_u16_t)vec_abs( temp1v );
+ coefvB0 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld(0, dct1);
+ temp2v = vec_ld(16, dct1);
+ mfvA1 = vec_ld(0, mf);
+ mfvB1 = vec_ld(16, mf);
+ biasvA1 = vec_ld(0, bias);
+ biasvB1 = vec_ld(16, bias);
+ mskA1 = vec_cmplt(temp1v, zero_s16v);
+ mskB1 = vec_cmplt(temp2v, zero_s16v);
+ coefvA1 = (vec_u16_t)vec_abs( temp1v );
+ coefvB1 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld(0, dct2);
+ temp2v = vec_ld(16, dct2);
+ mfvA2 = vec_ld(0, mf);
+ mfvB2 = vec_ld(16, mf);
+ biasvA2 = vec_ld(0, bias);
+ biasvB2 = vec_ld(16, bias);
+ mskA2 = vec_cmplt(temp1v, zero_s16v);
+ mskB2 = vec_cmplt(temp2v, zero_s16v);
+ coefvA2 = (vec_u16_t)vec_abs( temp1v );
+ coefvB2 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld(0, dct3);
+ temp2v = vec_ld(16, dct3);
+ mfvA3 = vec_ld(0, mf);
+ mfvB3 = vec_ld(16, mf);
+ biasvA3 = vec_ld(0, bias);
+ biasvB3 = vec_ld(16, bias);
+ mskA3 = vec_cmplt(temp1v, zero_s16v);
+ mskB3 = vec_cmplt(temp2v, zero_s16v);
+ coefvA3 = (vec_u16_t)vec_abs( temp1v );
+ coefvB3 = (vec_u16_t)vec_abs( temp2v );
+
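+    // Add the rounding bias to the absolute coefficients (saturating).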
+ coefvA0 = vec_adds(coefvA0, biasvA0);
+ coefvB0 = vec_adds(coefvB0, biasvB0);
+ coefvA1 = vec_adds(coefvA1, biasvA1);
+ coefvB1 = vec_adds(coefvB1, biasvB1);
+ coefvA2 = vec_adds(coefvA2, biasvA2);
+ coefvB2 = vec_adds(coefvB2, biasvB2);
+ coefvA3 = vec_adds(coefvA3, biasvA3);
+ coefvB3 = vec_adds(coefvB3, biasvB3);
+
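+    // For each block: multiply by mf (even/odd 16x16->32-bit products), shift
+    // right by 16, pack back to 16 bits, and re-interleave the even/odd halves
+    // into coefficient order with xxpermdi + vec_mergeh. The xor/add pair then
+    // negates the result wherever the input coefficient was negative.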
+ multEvenvA0 = vec_mule(coefvA0, mfvA0);
+ multOddvA0 = vec_mulo(coefvA0, mfvA0);
+ multEvenvB0 = vec_mule(coefvB0, mfvB0);
+ multOddvB0 = vec_mulo(coefvB0, mfvB0);
+ multEvenvA0 = vec_sr(multEvenvA0, i_qbitsv);
+ multOddvA0 = vec_sr(multOddvA0, i_qbitsv);
+ multEvenvB0 = vec_sr(multEvenvB0, i_qbitsv);
+ multOddvB0 = vec_sr(multOddvB0, i_qbitsv);
+ temp1v = (vec_s16_t) vec_packs( multEvenvA0, multOddvA0 );
+ temp2v = (vec_s16_t) vec_packs( multEvenvB0, multOddvB0 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor(temp1v, mskA0);
+ temp2v = vec_xor(temp2v, mskB0);
+ temp1v = vec_adds(temp1v, vec_and(mskA0, one));
+ temp2v = vec_adds(temp2v, vec_and(mskB0, one));
+ vec_st(temp1v, 0, dct0);
+ vec_st(temp2v, 16, dct0);
+ nz0 = vec_or(temp1v, temp2v);
+
+ multEvenvA1 = vec_mule(coefvA1, mfvA1);
+ multOddvA1 = vec_mulo(coefvA1, mfvA1);
+ multEvenvB1 = vec_mule(coefvB1, mfvB1);
+ multOddvB1 = vec_mulo(coefvB1, mfvB1);
+ multEvenvA1 = vec_sr(multEvenvA1, i_qbitsv);
+ multOddvA1 = vec_sr(multOddvA1, i_qbitsv);
+ multEvenvB1 = vec_sr(multEvenvB1, i_qbitsv);
+ multOddvB1 = vec_sr(multOddvB1, i_qbitsv);
+ temp1v = (vec_s16_t) vec_packs( multEvenvA1, multOddvA1 );
+ temp2v = (vec_s16_t) vec_packs( multEvenvB1, multOddvB1 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor(temp1v, mskA1);
+ temp2v = vec_xor(temp2v, mskB1);
+ temp1v = vec_adds(temp1v, vec_and(mskA1, one));
+ temp2v = vec_adds(temp2v, vec_and(mskB1, one));
+ vec_st(temp1v, 0, dct1);
+ vec_st(temp2v, 16, dct1);
+ nz1 = vec_or(temp1v, temp2v);
+
+ multEvenvA2 = vec_mule(coefvA2, mfvA2);
+ multOddvA2 = vec_mulo(coefvA2, mfvA2);
+ multEvenvB2 = vec_mule(coefvB2, mfvB2);
+ multOddvB2 = vec_mulo(coefvB2, mfvB2);
+ multEvenvA2 = vec_sr(multEvenvA2, i_qbitsv);
+ multOddvA2 = vec_sr(multOddvA2, i_qbitsv);
+ multEvenvB2 = vec_sr(multEvenvB2, i_qbitsv);
+ multOddvB2 = vec_sr(multOddvB2, i_qbitsv);
+ temp1v = (vec_s16_t) vec_packs( multEvenvA2, multOddvA2 );
+ temp2v = (vec_s16_t) vec_packs( multEvenvB2, multOddvB2 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor(temp1v, mskA2);
+ temp2v = vec_xor(temp2v, mskB2);
+ temp1v = vec_adds(temp1v, vec_and(mskA2, one));
+ temp2v = vec_adds(temp2v, vec_and(mskB2, one));
+ vec_st(temp1v, 0, dct2);
+ vec_st(temp2v, 16, dct2);
+ nz2 = vec_or(temp1v, temp2v);
+
+ multEvenvA3 = vec_mule(coefvA3, mfvA3);
+ multOddvA3 = vec_mulo(coefvA3, mfvA3);
+ multEvenvB3 = vec_mule(coefvB3, mfvB3);
+ multOddvB3 = vec_mulo(coefvB3, mfvB3);
+ multEvenvA3 = vec_sr(multEvenvA3, i_qbitsv);
+ multOddvA3 = vec_sr(multOddvA3, i_qbitsv);
+ multEvenvB3 = vec_sr(multEvenvB3, i_qbitsv);
+ multOddvB3 = vec_sr(multOddvB3, i_qbitsv);
+ temp1v = (vec_s16_t) vec_packs( multEvenvA3, multOddvA3 );
+ temp2v = (vec_s16_t) vec_packs( multEvenvB3, multOddvB3 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor(temp1v, mskA3);
+ temp2v = vec_xor(temp2v, mskB3);
+ temp1v = vec_adds(temp1v, vec_and(mskA3, one));
+ temp2v = vec_adds(temp2v, vec_and(mskB3, one));
+ vec_st(temp1v, 0, dct3);
+ vec_st(temp2v, 16, dct3);
+ nz3 = vec_or(temp1v, temp2v);
+
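+    // Bit j of the return value is set if block j has any nonzero coefficient.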
+ return vec_any_ne(nz0, zero_s16v) | (vec_any_ne(nz1, zero_s16v)<<1) | (vec_any_ne(nz2, zero_s16v)<<2) | (vec_any_ne(nz3, zero_s16v)<<3);
+}
+
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 ) \
{ \
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index dc6536af..780e3f91 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -26,6 +26,8 @@
#ifndef X264_PPC_QUANT_H
#define X264_PPC_QUANT_H
+#define x264_quant_4x4x4_altivec x264_template(quant_4x4x4_altivec)
+int x264_quant_4x4x4_altivec( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_4x4_altivec x264_template(quant_4x4_altivec)
int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_8x8_altivec x264_template(quant_8x8_altivec)
diff --git a/common/quant.c b/common/quant.c
index 92013e15..1e92575c 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -741,6 +741,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
pf->quant_4x4 = x264_quant_4x4_altivec;
+ pf->quant_4x4x4 = x264_quant_4x4x4_altivec;
pf->quant_8x8 = x264_quant_8x8_altivec;
pf->dequant_4x4 = x264_dequant_4x4_altivec;
--
2.12.2