[x264-devel] Re: [PATCH] Altivec optimizations for quant4x4, quant4x4dc, quant8x8, sub8x8_dct8, sub16x16_dct8
Guillaume POIRIER
poirierg at gmail.com
Sun Aug 27 19:54:48 CEST 2006
Hi there,
On 8/26/06, Guillaume POIRIER <poirierg at gmail.com> wrote:
[..]
> Here is an updated patch that puts back the comments on the macro, and
> uses the macro of x264 to do the 8x8 transpose.
>
> More comments welcome, and thanks Loren for your feedback and suggestions.
The attached patch adds sub16x16_dct8 to the set of optimized functions.
I realize that I have not advertised the speed-ups yet:
all functions get a 3.3x to 3.5x speed-up, except quant4x4, which gets
a far bigger one; I lost the sheet where I had written it down, but I
think it was around 7x.
Overall, on an encode with lots of options, RD, and such, I get a 2.5%
speed-up; encodes with more straightforward options should see an even
bigger overall gain.
Next step: x264_pixel_sa8d_8x8 and friends (about 20% done so far).
Guillaume
--
A thing is not necessarily true because a man dies for it.
-- Oscar Wilde
-------------- next part --------------
Index: encoder/macroblock.c
===================================================================
--- encoder/macroblock.c (revision 558)
+++ encoder/macroblock.c (working copy)
@@ -31,7 +31,7 @@
/* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
/* FIXME this seems to make better decisions with cqm=jvt, but could screw up
* with general custom matrices. */
-static const int def_quant4_mf[6][4][4] =
+static const int def_quant4_mf[6][4][4] __attribute__((__aligned__(16))) =
{
{ { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
{ 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
@@ -195,7 +195,7 @@
int y = 4 * block_idx_y[idx];
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
- int16_t dct4x4[4][4];
+ DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
if( h->mb.b_lossless )
{
@@ -223,7 +223,7 @@
int y = 8 * (idx>>1);
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
- int16_t dct8x8[8][8];
+ DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
@@ -242,7 +242,7 @@
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
- int16_t dct4x4[16+1][4][4];
+ DECLARE_ALIGNED( int16_t, dct4x4[16+1][4][4], 16 );
int i;
@@ -305,8 +305,8 @@
uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
int i_decimate_score = 0;
- int16_t dct2x2[2][2];
- int16_t dct4x4[4][4][4];
+ DECLARE_ALIGNED( int16_t, dct2x2[2][2] , 16 );
+ DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
if( h->mb.b_lossless )
{
@@ -494,7 +494,7 @@
}
else if( h->mb.b_transform_8x8 )
{
- int16_t dct8x8[4][8][8];
+ DECLARE_ALIGNED( int16_t, dct8x8[4][8][8], 16 );
int nnz8x8[4] = {1,1,1,1};
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
@@ -539,7 +539,7 @@
}
else
{
- int16_t dct4x4[16][4][4];
+ DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
int nnz8x8[4] = {1,1,1,1};
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
@@ -869,7 +869,7 @@
if( h->mb.b_transform_8x8 )
{
- int16_t dct8x8[8][8];
+ DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
@@ -888,7 +888,7 @@
else
{
int i4;
- int16_t dct4x4[4][4][4];
+ DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
@@ -919,7 +919,7 @@
for( ch = 0; ch < 2; ch++ )
{
- int16_t dct4x4[4][4];
+ DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
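
A note on the DECLARE_ALIGNED changes above: AltiVec's vec_ld/vec_st
simply ignore the low four bits of the address, so an unaligned buffer
silently reads or writes the wrong 16 bytes instead of faulting.
DECLARE_ALIGNED is x264's portability wrapper; roughly what it expands
to in the GCC case (just a sketch, see common/common.h for the real
definition):

#define DECLARE_ALIGNED( type, var, alignment ) \
    type var __attribute__((__aligned__(alignment)))

/* so the first hunk above is equivalent to: */
int16_t dct4x4[4][4] __attribute__((__aligned__(16)));
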
Index: common/ppc/dct.h
===================================================================
--- common/ppc/dct.h (revision 558)
+++ common/ppc/dct.h (working copy)
@@ -5,6 +5,7 @@
* $Id$
*
* Authors: Eric Petit <titer at m0k.org>
+ * Guillaume Poirier <gpoirier at mplayerhq.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -31,4 +32,9 @@
void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
+ uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
+ uint8_t *pix1, uint8_t *pix2 );
+
#endif
Index: common/ppc/dct.c
===================================================================
--- common/ppc/dct.c (revision 558)
+++ common/ppc/dct.c (working copy)
@@ -5,6 +5,7 @@
* $Id$
*
* Authors: Eric Petit <titer at m0k.org>
+ * Guillaume Poirier <gpoirier at mplayerhq.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -250,3 +251,118 @@
VEC_STORE8_L( temp6v, dct[15][2] );
VEC_STORE8_L( temp7v, dct[15][3] );
}
+
+/****************************************************************************
+* 8x8 transform:
+****************************************************************************/
+
+/* DCT8_1D unrolled by 8 in Altivec */
+#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
+{ \
+ /* int s07 = SRC(0) + SRC(7); */ \
+ vec_s16_t s07v = vec_add( dct0v, dct7v); \
+ /* int s16 = SRC(1) + SRC(6); */ \
+ vec_s16_t s16v = vec_add( dct1v, dct6v); \
+ /* int s25 = SRC(2) + SRC(5); */ \
+ vec_s16_t s25v = vec_add( dct2v, dct5v); \
+ /* int s34 = SRC(3) + SRC(4); */ \
+ vec_s16_t s34v = vec_add( dct3v, dct4v); \
+\
+ /* int a0 = s07 + s34; */ \
+ vec_s16_t a0v = vec_add(s07v, s34v); \
+ /* int a1 = s16 + s25; */ \
+ vec_s16_t a1v = vec_add(s16v, s25v); \
+ /* int a2 = s07 - s34; */ \
+ vec_s16_t a2v = vec_sub(s07v, s34v); \
+ /* int a3 = s16 - s25; */ \
+ vec_s16_t a3v = vec_sub(s16v, s25v); \
+\
+ /* int d07 = SRC(0) - SRC(7); */ \
+ vec_s16_t d07v = vec_sub( dct0v, dct7v); \
+ /* int d16 = SRC(1) - SRC(6); */ \
+ vec_s16_t d16v = vec_sub( dct1v, dct6v); \
+ /* int d25 = SRC(2) - SRC(5); */ \
+ vec_s16_t d25v = vec_sub( dct2v, dct5v); \
+ /* int d34 = SRC(3) - SRC(4); */ \
+ vec_s16_t d34v = vec_sub( dct3v, dct4v); \
+\
+ /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
+ vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
+ /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
+ vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
+ /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
+ vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
+ /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
+ vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
+\
+ /* DST(0) = a0 + a1; */ \
+ dct0v = vec_add( a0v, a1v ); \
+ /* DST(1) = a4 + (a7>>2); */ \
+ dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
+ /* DST(2) = a2 + (a3>>1); */ \
+ dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
+ /* DST(3) = a5 + (a6>>2); */ \
+ dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
+ /* DST(4) = a0 - a1; */ \
+ dct4v = vec_sub( a0v, a1v ); \
+ /* DST(5) = a6 - (a5>>2); */ \
+ dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
+ /* DST(6) = (a2>>1) - a3 ; */ \
+ dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
+ /* DST(7) = (a4>>2) - a7 ; */ \
+ dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
+}
+
+
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+ vec_s16_t onev = vec_splat_s16(1);
+ vec_s16_t twov = vec_add( onev, onev );
+
+ PREP_DIFF;
+
+ vec_s16_t dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v;
+
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
+
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
+ VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
+
+ DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v );
+
+ vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
+
+ VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v,
+ dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+ DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+ vec_st( dct_tr0v, 0, dct );
+ vec_st( dct_tr1v, 16, dct );
+ vec_st( dct_tr2v, 32, dct );
+ vec_st( dct_tr3v, 48, dct );
+
+ vec_st( dct_tr4v, 64, dct );
+ vec_st( dct_tr5v, 80, dct );
+ vec_st( dct_tr6v, 96, dct );
+ vec_st( dct_tr7v, 112, dct );
+}
+
+void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+ x264_sub8x8_dct8_altivec( dct[0], &pix1[0], &pix2[0] );
+ x264_sub8x8_dct8_altivec( dct[1], &pix1[8], &pix2[8] );
+ x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
+}
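
For anyone who doesn't read AltiVec fluently: DCT8_1D_ALTIVEC processes
all 8 rows at once, one row per register, so the scalar code quoted in
the comments runs in every 16-bit lane in parallel. A plain-C sketch of
the same butterfly (a throwaway helper, just for illustration; it
mirrors the scalar DCT8_1D in common/dct.c):

static void dct8_1d( int16_t d[8], const int16_t s[8] )
{
    int s07 = s[0] + s[7], s16 = s[1] + s[6];
    int s25 = s[2] + s[5], s34 = s[3] + s[4];
    int a0 = s07 + s34, a1 = s16 + s25;
    int a2 = s07 - s34, a3 = s16 - s25;
    int d07 = s[0] - s[7], d16 = s[1] - s[6];
    int d25 = s[2] - s[5], d34 = s[3] - s[4];
    int a4 = d16 + d25 + (d07 + (d07>>1));
    int a5 = d07 - d34 - (d25 + (d25>>1));
    int a6 = d07 + d34 - (d16 + (d16>>1));
    int a7 = d16 - d25 + (d34 + (d34>>1));
    d[0] = a0 + a1;        d[4] = a0 - a1;
    d[1] = a4 + (a7>>2);   d[5] = a6 - (a5>>2);
    d[2] = a2 + (a3>>1);   d[6] = (a2>>1) - a3;
    d[3] = a5 + (a6>>2);   d[7] = (a4>>2) - a7;
}

The vector version does the rows, transposes with VEC_TRANSPOSE_8, does
"rows" again (now really columns), and the eight vec_st calls at byte
offsets 0..112 then write out the whole 8x8 block of int16_t (128
bytes), which is why the dct array has to be 16-byte aligned.
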
Index: common/quant.c
===================================================================
--- common/quant.c (revision 558)
+++ common/quant.c (working copy)
@@ -25,6 +25,9 @@
#ifdef HAVE_MMXEXT
#include "i386/quant.h"
#endif
+#ifdef ARCH_PPC
+# include "ppc/quant.h"
+#endif
#define QUANT_ONE( coef, mf ) \
{ \
@@ -271,4 +274,39 @@
pf->dequant_8x8 = x264_dequant_8x8_mmx;
}
#endif /* HAVE_MMXEXT */
+
+#ifdef ARCH_PPC
+ if( cpu&X264_CPU_ALTIVEC )
+ {
+ /* determine the biggest coefficient in all the quant8_mf tables */
+ for( i = 0; i < 2*6*8*8; i++ )
+ {
+ int q = h->quant8_mf[0][0][0][i];
+ if( maxQ8 < q )
+ maxQ8 = q;
+ }
+
+ for( i = 0; i < 4*6*4*4; i++ )
+ {
+ int q = h->quant4_mf[0][0][0][i];
+ if( maxQ4 < q )
+ maxQ4 = q;
+ if( maxQdc < q && i%16 == 0 )
+ maxQdc = q;
+ }
+
+ if( maxQ8 < (1<<16) )
+ {
+ pf->quant_8x8_core = x264_quant_8x8_altivec;
+ }
+ if( maxQ4 < (1<<16) )
+ {
+ pf->quant_4x4_core = x264_quant_4x4_altivec;
+ }
+ if( maxQdc < (1<<15) )
+ {
+ pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
+ }
+ }
+#endif
}
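
The (1<<16) and (1<<15) guards are there because the AltiVec quant
cores keep the multiplier factors in 16-bit vector lanes, so with a
custom matrix whose factors don't fit we fall back to the C versions.
A sketch of the constraint, assuming the cores build the 32-bit
products with 16x16 multiplies (hypothetical fragment, not the actual
quant core from the patch):

vector unsigned short mfv, coefv;  /* mf and |coef| lanes          */
vector unsigned int evenv, oddv;
evenv = vec_mule( coefv, mfv );    /* u16*u16 -> u32 products      */
oddv  = vec_mulo( coefv, mfv );
/* mfv lanes are unsigned 16-bit, hence maxQ4/maxQ8 < (1<<16); the
 * dc core presumably multiplies signed, hence the stricter (1<<15). */

The i%16 == 0 test only samples the first entry of each 4x4 table,
since that is the only factor the dc quant uses.
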
Index: common/dct.c
===================================================================
--- common/dct.c (revision 558)
+++ common/dct.c (working copy)
@@ -437,6 +437,9 @@
dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
+
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
}
#endif
}
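
(For context: these pointers are what encoder/macroblock.c calls, e.g.
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst ); so flipping them here is
all that's needed to route the 8x8 transforms through AltiVec.)
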
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c (revision 558)
+++ tools/checkasm.c (working copy)
@@ -453,8 +453,9 @@
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- int16_t dct1[64], dct2[64];
- uint8_t cqm_buf[64];
+ int16_t dct1[64] __attribute__((__aligned__(16)));
+ int16_t dct2[64] __attribute__((__aligned__(16)));
+ uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
int i, i_cqm;
Index: Makefile
===================================================================
--- Makefile (revision 558)
+++ Makefile (working copy)
@@ -43,7 +43,8 @@
# AltiVec optims
ifeq ($(ARCH),PPC)
-SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+ common/ppc/quant.c
endif
# VIS optims