[x264-devel] ppc: AltiVec sub8x8_dct_dc
Alexandra Hájková
git at videolan.org
Thu Dec 1 21:01:45 CET 2016
x264 | branch: master | Alexandra Hájková <alexandra at khirnov.net> | Mon Nov 14 15:06:06 2016 +0100| [99863c665a6d4ec58b7fcc4a8a791e9c8f35a86e] | committer: Henrik Gramner
ppc: AltiVec sub8x8_dct_dc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=99863c665a6d4ec58b7fcc4a8a791e9c8f35a86e
---
common/dct.c | 1 +
common/ppc/dct.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
common/ppc/dct.h | 1 +
3 files changed, 56 insertions(+)
diff --git a/common/dct.c b/common/dct.c
index d59c2db..4052e3e 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -726,6 +726,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct = x264_add8x8_idct_altivec;
dctf->add16x16_idct = x264_add16x16_idct_altivec;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index d0fdfed..dd424dd 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -113,6 +113,60 @@ void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix
* 8x8 transform:
***************************************************************************/
+static void pix_diff( uint8_t *p1, uint8_t *p2, vec_s16_t *diff, int i ) /* Stores in diff[i] the lane-wise sum, over four consecutive rows, of ( p1 row - p2 row ). */
+{
+ vec_s16_t pix1v, pix2v, tmp[4];
+ vec_u8_t pix1v8, pix2v8;
+ LOAD_ZERO; /* macro defined elsewhere; presumably supplies the zero vector used by vec_u8_to_s16_h -- confirm */
+
+ for( int j = 0; j < 4; j++ ) /* one iteration per pixel row */
+ {
+ pix1v8 = vec_vsx_ld( 0, p1 ); /* NOTE(review): loads a whole vector from p1; presumably only half of it is used below -- confirm the buffer is readable that far */
+ pix2v8 = vec_vsx_ld( 0, p2 ); /* same full-vector load from the second source */
+ pix1v = vec_u8_to_s16_h( pix1v8 ); /* helper defined elsewhere; presumably widens one half of the bytes to signed 16-bit lanes -- confirm */
+ pix2v = vec_u8_to_s16_h( pix2v8 );
+ tmp[j] = vec_sub( pix1v, pix2v ); /* per-lane difference for row j */
+ p1 += FENC_STRIDE; /* the two sources advance by different strides */
+ p2 += FDEC_STRIDE;
+ }
+ diff[i] = vec_add( tmp[0], tmp[1] ); /* fold the four row differences into one vector */
+ diff[i] = vec_add( diff[i], tmp[2] );
+ diff[i] = vec_add( diff[i], tmp[3] );
+}
+
+void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ) /* Reduces the pix1 - pix2 differences of rows 0-3 and rows 4-7 to four sums, then writes their 2x2 transform to dct[0..3]. */
+{
+ vec_s16_t diff[2];
+ vec_s32_t sum[2];
+ vec_s32_t zero32 = vec_splat_s32(0);
+ vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* vec_perm selector: bytes 0-7 of the first operand ... */
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; /* ... followed by bytes 8-15 of the second operand */
+
+ pix_diff( &pix1[0], &pix2[0], diff, 0 ); /* rows 0-3 -> diff[0] */
+ pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 ); /* rows 4-7 -> diff[1] */
+
+ sum[0] = vec_sum4s( diff[0], zero32 ); /* add adjacent 16-bit lanes into 32-bit lanes (pairs of columns) */
+ sum[1] = vec_sum4s( diff[1], zero32 );
+ diff[0] = vec_packs( sum[0], sum[1] ); /* saturating narrow back to 16-bit: pair sums of both row groups in one vector */
+ sum[0] = vec_sum4s( diff[0], zero32 ); /* second pairwise add: each 32-bit lane now covers four columns */
+ diff[0] = vec_packs( sum[0], zero32 ); /* four sums (presumably one per 4x4 quadrant -- confirm lane order) followed by zeros */
+
+ diff[1] = vec_vsx_ld( 0, dct ); /* NOTE(review): whole-vector load although dct is declared int16_t[4]; reads past the four coefficients -- confirm callers make this safe */
+ diff[0] = vec_perm( diff[0], diff[1], mask ); /* keep the four new sums, restore the bytes that followed dct in memory */
+
+ vec_vsx_st( diff[0], 0, dct ); /* NOTE(review): whole-vector store; rewrites the trailing bytes with the values just loaded */
+
+ /* 2x2 DC transform */
+ int d0 = dct[0] + dct[1]; /* sum and difference within each pair ... */
+ int d1 = dct[2] + dct[3];
+ int d2 = dct[0] - dct[1];
+ int d3 = dct[2] - dct[3];
+ dct[0] = d0 + d1; /* ... then sum and difference across the pairs */
+ dct[1] = d0 - d1;
+ dct[2] = d2 + d3;
+ dct[3] = d2 - d3;
+}
+
/* DCT8_1D unrolled by 8 in Altivec */
#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
{ \
diff --git a/common/ppc/dct.h b/common/ppc/dct.h
index 4011b8f..7e01bae 100644
--- a/common/ppc/dct.h
+++ b/common/ppc/dct.h
@@ -37,6 +37,7 @@ void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[16] );
void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
More information about the x264-devel
mailing list