[x264-devel] [PATCH 2/2] ppc: Add quant_4x4x4

Luca Barbato lu_zero at gentoo.org
Fri Aug 17 22:28:45 CEST 2018


4x faster than the C implementation.
---
 common/ppc/quant.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 common/ppc/quant.h |   2 +
 common/quant.c     |   1 +
 3 files changed, 204 insertions(+)

diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index f69c54f1..45bd8388 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -90,6 +90,207 @@ int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16]
     return vec_any_ne(nz, zero_s16v);
 }
 
+int x264_quant_4x4x4_altivec( dctcoef dcta[4][16], udctcoef mf[16], udctcoef bias[16] )
+{ // Quantizes the four 16-coefficient blocks of dcta in place using the shared mf/bias tables; returns a 4-bit mask with bit i set when block i has a nonzero result.
+    LOAD_ZERO; // macro defined outside this hunk; presumably declares zero_s16v, which is used below
+    vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 ); // right-shift count of 16 in every 32-bit lane
+    vec_s16_t one = vec_splat_s16(1); // 1 in every 16-bit lane, used when restoring the sign
+    vec_s16_t nz0, nz1, nz2, nz3; // per-block OR of the quantized output, tested in the return statement
+    // Naming below: A/B = first/second vector of a block (byte offsets 0/16), trailing digit = block index.
+    vector bool short mskA0;
+    vec_u16_t coefvA0;
+    vec_u32_t multEvenvA0, multOddvA0;
+    vec_u16_t mfvA0;
+    vec_u16_t biasvA0;
+    vector bool short mskB0;
+    vec_u16_t coefvB0;
+    vec_u32_t multEvenvB0, multOddvB0;
+    vec_u16_t mfvB0;
+    vec_u16_t biasvB0;
+
+    vector bool short mskA1;
+    vec_u16_t coefvA1;
+    vec_u32_t multEvenvA1, multOddvA1;
+    vec_u16_t mfvA1;
+    vec_u16_t biasvA1;
+    vector bool short mskB1;
+    vec_u16_t coefvB1;
+    vec_u32_t multEvenvB1, multOddvB1;
+    vec_u16_t mfvB1;
+    vec_u16_t biasvB1;
+
+    vector bool short mskA2;
+    vec_u16_t coefvA2;
+    vec_u32_t multEvenvA2, multOddvA2;
+    vec_u16_t mfvA2;
+    vec_u16_t biasvA2;
+    vector bool short mskB2;
+    vec_u16_t coefvB2;
+    vec_u32_t multEvenvB2, multOddvB2;
+    vec_u16_t mfvB2;
+    vec_u16_t biasvB2;
+
+    vector bool short mskA3;
+    vec_u16_t coefvA3;
+    vec_u32_t multEvenvA3, multOddvA3;
+    vec_u16_t mfvA3;
+    vec_u16_t biasvA3;
+    vector bool short mskB3;
+    vec_u16_t coefvB3;
+    vec_u32_t multEvenvB3, multOddvB3;
+    vec_u16_t mfvB3;
+    vec_u16_t biasvB3;
+
+    vec_s16_t temp1v, temp2v; // scratch: raw input while loading, quantized output later
+    vec_s16_t tmpv0; // scratch for the xxpermdi result of temp1v
+    vec_s16_t tmpv1; // scratch for the xxpermdi result of temp2v
+
+    dctcoef *dct0 = dcta[0];
+    dctcoef *dct1 = dcta[1];
+    dctcoef *dct2 = dcta[2];
+    dctcoef *dct3 = dcta[3];
+    // Load phase: for each block fetch both vectors, record which lanes are negative, and keep the absolute values.
+    temp1v = vec_ld(0, dct0); // vector at byte offset 0 of block 0
+    temp2v = vec_ld(16, dct0); // vector at byte offset 16 of block 0
+    mfvA0 = vec_ld(0, mf);
+    mfvB0 = vec_ld(16, mf);
+    biasvA0 = vec_ld(0, bias);
+    biasvB0 = vec_ld(16, bias);
+    mskA0 = vec_cmplt(temp1v, zero_s16v); // mask of lanes whose coefficient is negative
+    mskB0 = vec_cmplt(temp2v, zero_s16v);
+    coefvA0 = (vec_u16_t)vec_abs( temp1v ); // magnitudes, reinterpreted as unsigned lanes
+    coefvB0 = (vec_u16_t)vec_abs( temp2v );
+    temp1v = vec_ld(0, dct1); // scratch reused: block 0's sign and magnitude already live in mskA0/coefvA0
+    temp2v = vec_ld(16, dct1);
+    mfvA1 = vec_ld(0, mf); // same mf/bias addresses as for block 0, loaded again into per-block variables
+    mfvB1 = vec_ld(16, mf);
+    biasvA1 = vec_ld(0, bias);
+    biasvB1 = vec_ld(16, bias);
+    mskA1 = vec_cmplt(temp1v, zero_s16v);
+    mskB1 = vec_cmplt(temp2v, zero_s16v);
+    coefvA1 = (vec_u16_t)vec_abs( temp1v );
+    coefvB1 = (vec_u16_t)vec_abs( temp2v );
+    temp1v = vec_ld(0, dct2);
+    temp2v = vec_ld(16, dct2);
+    mfvA2 = vec_ld(0, mf);
+    mfvB2 = vec_ld(16, mf);
+    biasvA2 = vec_ld(0, bias);
+    biasvB2 = vec_ld(16, bias);
+    mskA2 = vec_cmplt(temp1v, zero_s16v);
+    mskB2 = vec_cmplt(temp2v, zero_s16v);
+    coefvA2 = (vec_u16_t)vec_abs( temp1v );
+    coefvB2 = (vec_u16_t)vec_abs( temp2v );
+    temp1v = vec_ld(0, dct3);
+    temp2v = vec_ld(16, dct3);
+    mfvA3 = vec_ld(0, mf);
+    mfvB3 = vec_ld(16, mf);
+    biasvA3 = vec_ld(0, bias);
+    biasvB3 = vec_ld(16, bias);
+    mskA3 = vec_cmplt(temp1v, zero_s16v);
+    mskB3 = vec_cmplt(temp2v, zero_s16v);
+    coefvA3 = (vec_u16_t)vec_abs( temp1v );
+    coefvB3 = (vec_u16_t)vec_abs( temp2v );
+    // Add bias to every magnitude; vec_adds is the saturating add, so a large sum clamps instead of wrapping.
+    coefvA0 = vec_adds(coefvA0, biasvA0);
+    coefvB0 = vec_adds(coefvB0, biasvB0);
+    coefvA1 = vec_adds(coefvA1, biasvA1);
+    coefvB1 = vec_adds(coefvB1, biasvB1);
+    coefvA2 = vec_adds(coefvA2, biasvA2);
+    coefvB2 = vec_adds(coefvB2, biasvB2);
+    coefvA3 = vec_adds(coefvA3, biasvA3);
+    coefvB3 = vec_adds(coefvB3, biasvB3);
+    // Block 0: multiply by mf, shift each 32-bit product right by 16, narrow, reorder, restore sign, store.
+    multEvenvA0 = vec_mule(coefvA0, mfvA0); // 32-bit products of the even-numbered 16-bit lanes
+    multOddvA0 = vec_mulo(coefvA0, mfvA0); // 32-bit products of the odd-numbered 16-bit lanes
+    multEvenvB0 = vec_mule(coefvB0, mfvB0);
+    multOddvB0 = vec_mulo(coefvB0, mfvB0);
+    multEvenvA0 = vec_sr(multEvenvA0, i_qbitsv); // >> 16 on every product
+    multOddvA0 = vec_sr(multOddvA0, i_qbitsv);
+    multEvenvB0 = vec_sr(multEvenvB0, i_qbitsv);
+    multOddvB0 = vec_sr(multOddvB0, i_qbitsv);
+    temp1v = (vec_s16_t) vec_packs( multEvenvA0, multOddvA0 ); // saturating narrow to 16-bit lanes: even-lane results in one half, odd-lane results in the other
+    temp2v = (vec_s16_t) vec_packs( multEvenvB0, multOddvB0 );
+    tmpv0 = xxpermdi( temp1v, temp1v, 2 ); // xxpermdi is defined outside this hunk; presumably swaps the two 64-bit halves - TODO confirm
+    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+    temp1v = vec_mergeh( temp1v, tmpv0 ); // interleave the even and odd halves; intended to restore coefficient order - NOTE(review): confirm for both endiannesses
+    temp2v = vec_mergeh( temp2v, tmpv1 );
+    temp1v = vec_xor(temp1v, mskA0); // flip all bits in lanes whose input was negative...
+    temp2v = vec_xor(temp2v, mskB0);
+    temp1v = vec_adds(temp1v, vec_and(mskA0, one)); // ...then add 1 in those lanes, completing the two's-complement negation
+    temp2v = vec_adds(temp2v, vec_and(mskB0, one));
+    vec_st(temp1v, 0, dct0); // overwrite the input block with the quantized values
+    vec_st(temp2v, 16, dct0);
+    nz0 = vec_or(temp1v, temp2v); // has a nonzero lane iff block 0 has a nonzero quantized coefficient
+    // Block 1: same sequence as block 0.
+    multEvenvA1 = vec_mule(coefvA1, mfvA1);
+    multOddvA1 = vec_mulo(coefvA1, mfvA1);
+    multEvenvB1 = vec_mule(coefvB1, mfvB1);
+    multOddvB1 = vec_mulo(coefvB1, mfvB1);
+    multEvenvA1 = vec_sr(multEvenvA1, i_qbitsv);
+    multOddvA1 = vec_sr(multOddvA1, i_qbitsv);
+    multEvenvB1 = vec_sr(multEvenvB1, i_qbitsv);
+    multOddvB1 = vec_sr(multOddvB1, i_qbitsv);
+    temp1v = (vec_s16_t) vec_packs( multEvenvA1, multOddvA1 );
+    temp2v = (vec_s16_t) vec_packs( multEvenvB1, multOddvB1 );
+    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+    temp1v = vec_mergeh( temp1v, tmpv0 );
+    temp2v = vec_mergeh( temp2v, tmpv1 );
+    temp1v = vec_xor(temp1v, mskA1);
+    temp2v = vec_xor(temp2v, mskB1);
+    temp1v = vec_adds(temp1v, vec_and(mskA1, one));
+    temp2v = vec_adds(temp2v, vec_and(mskB1, one));
+    vec_st(temp1v, 0, dct1);
+    vec_st(temp2v, 16, dct1);
+    nz1 = vec_or(temp1v, temp2v);
+    // Block 2: same sequence as block 0.
+    multEvenvA2 = vec_mule(coefvA2, mfvA2);
+    multOddvA2 = vec_mulo(coefvA2, mfvA2);
+    multEvenvB2 = vec_mule(coefvB2, mfvB2);
+    multOddvB2 = vec_mulo(coefvB2, mfvB2);
+    multEvenvA2 = vec_sr(multEvenvA2, i_qbitsv);
+    multOddvA2 = vec_sr(multOddvA2, i_qbitsv);
+    multEvenvB2 = vec_sr(multEvenvB2, i_qbitsv);
+    multOddvB2 = vec_sr(multOddvB2, i_qbitsv);
+    temp1v = (vec_s16_t) vec_packs( multEvenvA2, multOddvA2 );
+    temp2v = (vec_s16_t) vec_packs( multEvenvB2, multOddvB2 );
+    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+    temp1v = vec_mergeh( temp1v, tmpv0 );
+    temp2v = vec_mergeh( temp2v, tmpv1 );
+    temp1v = vec_xor(temp1v, mskA2);
+    temp2v = vec_xor(temp2v, mskB2);
+    temp1v = vec_adds(temp1v, vec_and(mskA2, one));
+    temp2v = vec_adds(temp2v, vec_and(mskB2, one));
+    vec_st(temp1v, 0, dct2);
+    vec_st(temp2v, 16, dct2);
+    nz2 = vec_or(temp1v, temp2v);
+    // Block 3: same sequence as block 0.
+    multEvenvA3 = vec_mule(coefvA3, mfvA3);
+    multOddvA3 = vec_mulo(coefvA3, mfvA3);
+    multEvenvB3 = vec_mule(coefvB3, mfvB3);
+    multOddvB3 = vec_mulo(coefvB3, mfvB3);
+    multEvenvA3 = vec_sr(multEvenvA3, i_qbitsv);
+    multOddvA3 = vec_sr(multOddvA3, i_qbitsv);
+    multEvenvB3 = vec_sr(multEvenvB3, i_qbitsv);
+    multOddvB3 = vec_sr(multOddvB3, i_qbitsv);
+    temp1v = (vec_s16_t) vec_packs( multEvenvA3, multOddvA3 );
+    temp2v = (vec_s16_t) vec_packs( multEvenvB3, multOddvB3 );
+    tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+    tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+    temp1v = vec_mergeh( temp1v, tmpv0 );
+    temp2v = vec_mergeh( temp2v, tmpv1 );
+    temp1v = vec_xor(temp1v, mskA3);
+    temp2v = vec_xor(temp2v, mskB3);
+    temp1v = vec_adds(temp1v, vec_and(mskA3, one));
+    temp2v = vec_adds(temp2v, vec_and(mskB3, one));
+    vec_st(temp1v, 0, dct3);
+    vec_st(temp2v, 16, dct3);
+    nz3 = vec_or(temp1v, temp2v);
+    // Combine the per-block tests: bit i of the result is set when block i has at least one nonzero lane.
+    return vec_any_ne(nz0, zero_s16v) | (vec_any_ne(nz1, zero_s16v)<<1) | (vec_any_ne(nz2, zero_s16v)<<2) | (vec_any_ne(nz3, zero_s16v)<<3);
+}
+
 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
 #define QUANT_16_U_DC( idx0, idx1 )                                 \
 {                                                                   \
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index dc6536af..780e3f91 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -26,6 +26,8 @@
 #ifndef X264_PPC_QUANT_H
 #define X264_PPC_QUANT_H
 
+#define x264_quant_4x4x4_altivec x264_template(quant_4x4x4_altivec)
+int x264_quant_4x4x4_altivec( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 #define x264_quant_4x4_altivec x264_template(quant_4x4_altivec)
 int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
 #define x264_quant_8x8_altivec x264_template(quant_8x8_altivec)
diff --git a/common/quant.c b/common/quant.c
index 92013e15..1e92575c 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -741,6 +741,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
         pf->quant_4x4 = x264_quant_4x4_altivec;
+        pf->quant_4x4x4 = x264_quant_4x4x4_altivec;
         pf->quant_8x8 = x264_quant_8x8_altivec;
 
         pf->dequant_4x4 = x264_dequant_4x4_altivec;
-- 
2.12.2



More information about the x264-devel mailing list