[x264-devel] ppc: Add quant_4x4x4
Luca Barbato
git at videolan.org
Tue Mar 12 19:31:49 CET 2019
x264 | branch: master | Luca Barbato <lu_zero at gentoo.org> | Fri Aug 17 22:28:45 2018 +0200| [4dd83955b282e722fbeb3f4ee5cc05a45dc54c7f] | committer: Anton Mitrofanov
ppc: Add quant_4x4x4
4x faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=4dd83955b282e722fbeb3f4ee5cc05a45dc54c7f
---
common/ppc/quant.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++++
common/ppc/quant.h | 2 +
common/quant.c | 1 +
3 files changed, 205 insertions(+)
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index f69c54f1..d72f42b5 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -90,6 +90,208 @@ int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16]
return vec_any_ne(nz, zero_s16v);
}
+int x264_quant_4x4x4_altivec( dctcoef dcta[4][16], udctcoef mf[16], udctcoef bias[16] ) // Quantizes the four 16-coefficient blocks dcta[0..3] in place, using the same mf[] and bias[] tables for each.
+{ // Returns a 4-bit mask: bit i is set when block i holds at least one nonzero coefficient after quantization.
+ LOAD_ZERO; // project macro defined elsewhere; presumably declares the zero vector behind zero_s16v used below
+ vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 ); // every 32-bit product is shifted right by 16
+ vec_s16_t one = vec_splat_s16( 1 ); // per-lane 1, added to negative lanes to finish their negation
+ vec_s16_t nz0, nz1, nz2, nz3; // per-block OR of the two stored vectors, tested for nonzero at the end
+ // Naming: A = vector loaded at byte offset 0, B = vector loaded at byte offset 16; the digit suffix is the block index.
+ vector bool short mskA0; // all-ones in lanes whose input coefficient was negative
+ vec_u16_t coefvA0; // |coef|, then |coef| + bias
+ vec_u32_t multEvenvA0, multOddvA0; // 32-bit products of the even / odd 16-bit lanes
+ vec_u16_t mfvA0; // vector loaded from mf
+ vec_u16_t biasvA0; // vector loaded from bias
+ vector bool short mskB0;
+ vec_u16_t coefvB0;
+ vec_u32_t multEvenvB0, multOddvB0;
+ vec_u16_t mfvB0;
+ vec_u16_t biasvB0;
+
+ vector bool short mskA1;
+ vec_u16_t coefvA1;
+ vec_u32_t multEvenvA1, multOddvA1;
+ vec_u16_t mfvA1;
+ vec_u16_t biasvA1;
+ vector bool short mskB1;
+ vec_u16_t coefvB1;
+ vec_u32_t multEvenvB1, multOddvB1;
+ vec_u16_t mfvB1;
+ vec_u16_t biasvB1;
+
+ vector bool short mskA2;
+ vec_u16_t coefvA2;
+ vec_u32_t multEvenvA2, multOddvA2;
+ vec_u16_t mfvA2;
+ vec_u16_t biasvA2;
+ vector bool short mskB2;
+ vec_u16_t coefvB2;
+ vec_u32_t multEvenvB2, multOddvB2;
+ vec_u16_t mfvB2;
+ vec_u16_t biasvB2;
+
+ vector bool short mskA3;
+ vec_u16_t coefvA3;
+ vec_u32_t multEvenvA3, multOddvA3;
+ vec_u16_t mfvA3;
+ vec_u16_t biasvA3;
+ vector bool short mskB3;
+ vec_u16_t coefvB3;
+ vec_u32_t multEvenvB3, multOddvB3;
+ vec_u16_t mfvB3;
+ vec_u16_t biasvB3;
+
+ vec_s16_t temp1v, temp2v; // scratch: raw input first, later the quantized A / B vectors
+ vec_s16_t tmpv0; // xxpermdi result for the A vector, second operand of vec_mergeh
+ vec_s16_t tmpv1; // xxpermdi result for the B vector, second operand of vec_mergeh
+ // Pointers to the first coefficient of each of the four blocks.
+ dctcoef *dct0 = dcta[0];
+ dctcoef *dct1 = dcta[1];
+ dctcoef *dct2 = dcta[2];
+ dctcoef *dct3 = dcta[3];
+ // Phase 1, per block: load both vectors, record the sign mask, take magnitudes. vec_ld ignores the low 4 address bits, so 16-byte-aligned inputs are presumably required - confirm with callers.
+ temp1v = vec_ld( 0, dct0 );
+ temp2v = vec_ld( 16, dct0 );
+ mfvA0 = vec_ld( 0, mf ); // mf and bias are re-read from the same addresses for every block
+ mfvB0 = vec_ld( 16, mf );
+ biasvA0 = vec_ld( 0, bias );
+ biasvB0 = vec_ld( 16, bias );
+ mskA0 = vec_cmplt( temp1v, zero_s16v ); // mask of negative lanes, used to restore the sign after quantization
+ mskB0 = vec_cmplt( temp2v, zero_s16v );
+ coefvA0 = (vec_u16_t)vec_abs( temp1v ); // magnitude, reinterpreted as unsigned
+ coefvB0 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld( 0, dct1 );
+ temp2v = vec_ld( 16, dct1 );
+ mfvA1 = vec_ld( 0, mf );
+ mfvB1 = vec_ld( 16, mf );
+ biasvA1 = vec_ld( 0, bias );
+ biasvB1 = vec_ld( 16, bias );
+ mskA1 = vec_cmplt( temp1v, zero_s16v );
+ mskB1 = vec_cmplt( temp2v, zero_s16v );
+ coefvA1 = (vec_u16_t)vec_abs( temp1v );
+ coefvB1 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld( 0, dct2 );
+ temp2v = vec_ld( 16, dct2 );
+ mfvA2 = vec_ld( 0, mf );
+ mfvB2 = vec_ld( 16, mf );
+ biasvA2 = vec_ld( 0, bias );
+ biasvB2 = vec_ld( 16, bias );
+ mskA2 = vec_cmplt( temp1v, zero_s16v );
+ mskB2 = vec_cmplt( temp2v, zero_s16v );
+ coefvA2 = (vec_u16_t)vec_abs( temp1v );
+ coefvB2 = (vec_u16_t)vec_abs( temp2v );
+ temp1v = vec_ld( 0, dct3 );
+ temp2v = vec_ld( 16, dct3 );
+ mfvA3 = vec_ld( 0, mf );
+ mfvB3 = vec_ld( 16, mf );
+ biasvA3 = vec_ld( 0, bias );
+ biasvB3 = vec_ld( 16, bias );
+ mskA3 = vec_cmplt( temp1v, zero_s16v );
+ mskB3 = vec_cmplt( temp2v, zero_s16v );
+ coefvA3 = (vec_u16_t)vec_abs( temp1v );
+ coefvB3 = (vec_u16_t)vec_abs( temp2v );
+ // Phase 2: add bias to every magnitude; vec_adds on unsigned lanes saturates instead of wrapping.
+ coefvA0 = vec_adds( coefvA0, biasvA0 );
+ coefvB0 = vec_adds( coefvB0, biasvB0 );
+ coefvA1 = vec_adds( coefvA1, biasvA1 );
+ coefvB1 = vec_adds( coefvB1, biasvB1 );
+ coefvA2 = vec_adds( coefvA2, biasvA2 );
+ coefvB2 = vec_adds( coefvB2, biasvB2 );
+ coefvA3 = vec_adds( coefvA3, biasvA3 );
+ coefvB3 = vec_adds( coefvB3, biasvB3 );
+ // Phase 3, block 0: multiply by mf, shift, repack, restore signs, store.
+ multEvenvA0 = vec_mule( coefvA0, mfvA0 ); // vec_mule / vec_mulo widen the even / odd 16-bit lanes into 32-bit products
+ multOddvA0 = vec_mulo( coefvA0, mfvA0 );
+ multEvenvB0 = vec_mule( coefvB0, mfvB0 );
+ multOddvB0 = vec_mulo( coefvB0, mfvB0 );
+ multEvenvA0 = vec_sr( multEvenvA0, i_qbitsv ); // >> 16 keeps the high half of each product
+ multOddvA0 = vec_sr( multOddvA0, i_qbitsv );
+ multEvenvB0 = vec_sr( multEvenvB0, i_qbitsv );
+ multOddvB0 = vec_sr( multOddvB0, i_qbitsv );
+ temp1v = (vec_s16_t)vec_packs( multEvenvA0, multOddvA0 ); // saturating pack: even-lane results fill one half, odd-lane results the other
+ temp2v = (vec_s16_t)vec_packs( multEvenvB0, multOddvB0 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 ); // xxpermdi is defined elsewhere; NOTE(review): selector 2 presumably swaps the two doublewords - confirm
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 ); // interleaves the two operands; presumably puts even/odd results back in coefficient order (depends on xxpermdi)
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor( temp1v, mskA0 ); // sign restore, step 1: lanes with an all-ones mask become ~x, others are unchanged
+ temp2v = vec_xor( temp2v, mskB0 );
+ temp1v = vec_adds( temp1v, vec_and( mskA0, one ) ); // step 2: +1 on the masked lanes only, i.e. ~x + 1 = -x
+ temp2v = vec_adds( temp2v, vec_and( mskB0, one ) );
+ vec_st( temp1v, 0, dct0 ); // quantized values overwrite the input block
+ vec_st( temp2v, 16, dct0 );
+ nz0 = vec_or( temp1v, temp2v ); // has a set bit iff some stored coefficient of block 0 is nonzero
+ // Block 1: same sequence as block 0.
+ multEvenvA1 = vec_mule( coefvA1, mfvA1 );
+ multOddvA1 = vec_mulo( coefvA1, mfvA1 );
+ multEvenvB1 = vec_mule( coefvB1, mfvB1 );
+ multOddvB1 = vec_mulo( coefvB1, mfvB1 );
+ multEvenvA1 = vec_sr( multEvenvA1, i_qbitsv );
+ multOddvA1 = vec_sr( multOddvA1, i_qbitsv );
+ multEvenvB1 = vec_sr( multEvenvB1, i_qbitsv );
+ multOddvB1 = vec_sr( multOddvB1, i_qbitsv );
+ temp1v = (vec_s16_t)vec_packs( multEvenvA1, multOddvA1 );
+ temp2v = (vec_s16_t)vec_packs( multEvenvB1, multOddvB1 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor( temp1v, mskA1 );
+ temp2v = vec_xor( temp2v, mskB1 );
+ temp1v = vec_adds( temp1v, vec_and( mskA1, one ) );
+ temp2v = vec_adds( temp2v, vec_and( mskB1, one ) );
+ vec_st( temp1v, 0, dct1 );
+ vec_st( temp2v, 16, dct1 );
+ nz1 = vec_or( temp1v, temp2v );
+ // Block 2: same sequence as block 0.
+ multEvenvA2 = vec_mule( coefvA2, mfvA2 );
+ multOddvA2 = vec_mulo( coefvA2, mfvA2 );
+ multEvenvB2 = vec_mule( coefvB2, mfvB2 );
+ multOddvB2 = vec_mulo( coefvB2, mfvB2 );
+ multEvenvA2 = vec_sr( multEvenvA2, i_qbitsv );
+ multOddvA2 = vec_sr( multOddvA2, i_qbitsv );
+ multEvenvB2 = vec_sr( multEvenvB2, i_qbitsv );
+ multOddvB2 = vec_sr( multOddvB2, i_qbitsv );
+ temp1v = (vec_s16_t)vec_packs( multEvenvA2, multOddvA2 );
+ temp2v = (vec_s16_t)vec_packs( multEvenvB2, multOddvB2 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor( temp1v, mskA2 );
+ temp2v = vec_xor( temp2v, mskB2 );
+ temp1v = vec_adds( temp1v, vec_and( mskA2, one ) );
+ temp2v = vec_adds( temp2v, vec_and( mskB2, one ) );
+ vec_st( temp1v, 0, dct2 );
+ vec_st( temp2v, 16, dct2 );
+ nz2 = vec_or( temp1v, temp2v );
+ // Block 3: same sequence as block 0.
+ multEvenvA3 = vec_mule( coefvA3, mfvA3 );
+ multOddvA3 = vec_mulo( coefvA3, mfvA3 );
+ multEvenvB3 = vec_mule( coefvB3, mfvB3 );
+ multOddvB3 = vec_mulo( coefvB3, mfvB3 );
+ multEvenvA3 = vec_sr( multEvenvA3, i_qbitsv );
+ multOddvA3 = vec_sr( multOddvA3, i_qbitsv );
+ multEvenvB3 = vec_sr( multEvenvB3, i_qbitsv );
+ multOddvB3 = vec_sr( multOddvB3, i_qbitsv );
+ temp1v = (vec_s16_t)vec_packs( multEvenvA3, multOddvA3 );
+ temp2v = (vec_s16_t)vec_packs( multEvenvB3, multOddvB3 );
+ tmpv0 = xxpermdi( temp1v, temp1v, 2 );
+ tmpv1 = xxpermdi( temp2v, temp2v, 2 );
+ temp1v = vec_mergeh( temp1v, tmpv0 );
+ temp2v = vec_mergeh( temp2v, tmpv1 );
+ temp1v = vec_xor( temp1v, mskA3 );
+ temp2v = vec_xor( temp2v, mskB3 );
+ temp1v = vec_adds( temp1v, vec_and( mskA3, one ) );
+ temp2v = vec_adds( temp2v, vec_and( mskB3, one ) );
+ vec_st( temp1v, 0, dct3 );
+ vec_st( temp2v, 16, dct3 );
+ nz3 = vec_or( temp1v, temp2v );
+ // Combine the four per-block nonzero tests into one mask, block i in bit i.
+ return (vec_any_ne( nz0, zero_s16v ) << 0) | (vec_any_ne( nz1, zero_s16v ) << 1) |
+ (vec_any_ne( nz2, zero_s16v ) << 2) | (vec_any_ne( nz3, zero_s16v ) << 3);
+}
+
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 ) \
{ \
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index dc6536af..780e3f91 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -26,6 +26,8 @@
#ifndef X264_PPC_QUANT_H
#define X264_PPC_QUANT_H
+#define x264_quant_4x4x4_altivec x264_template(quant_4x4x4_altivec)
+int x264_quant_4x4x4_altivec( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ); // quantizes four 16-coefficient blocks in place; returns a 4-bit mask with bit i set when block i has a nonzero coefficient
#define x264_quant_4x4_altivec x264_template(quant_4x4_altivec)
int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_8x8_altivec x264_template(quant_8x8_altivec)
diff --git a/common/quant.c b/common/quant.c
index 92013e15..1e92575c 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -741,6 +741,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
pf->quant_4x4 = x264_quant_4x4_altivec;
+ pf->quant_4x4x4 = x264_quant_4x4x4_altivec;
pf->quant_8x8 = x264_quant_8x8_altivec;
pf->dequant_4x4 = x264_dequant_4x4_altivec;
More information about the x264-devel
mailing list