[x264-devel] x86: AVX-512 sub16x16_dct
Henrik Gramner
git at videolan.org
Mon Jun 26 21:58:54 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Jun 10 16:01:53 2017 +0200| [9034085265e5ca56e801c3efbf5c538fcc17c82b] | committer: Henrik Gramner
x86: AVX-512 sub16x16_dct
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9034085265e5ca56e801c3efbf5c538fcc17c82b
---
common/common.h | 2 +-
common/dct.c | 1 +
common/macroblock.c | 5 ++---
common/x86/dct-a.asm | 33 +++++++++++++++++++++++++++++----
common/x86/dct.h | 1 +
5 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/common/common.h b/common/common.h
index 33319838..867b2073 100644
--- a/common/common.h
+++ b/common/common.h
@@ -778,7 +778,7 @@ struct x264_t
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
diff --git a/common/dct.c b/common/dct.c
index 3e7aa39a..0d7f96de 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -716,6 +716,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
+ dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
}
#endif //HAVE_MMX
diff --git a/common/macroblock.c b/common/macroblock.c
index 8dc9f975..61686715 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -532,16 +532,15 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 31897d9a..42af7c63 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -47,10 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
-dct_avx512: dd 0x00000000, 0x00000104, 0x0000014c, 0x00000048 ; bits 0-4: dct8x8_fenc
- dd 0x00000210, 0x00000314, 0x0000035c, 0x00000258 ; bits 5-9: dct8x8_fdec
- dd 0x00000021, 0x00000125, 0x0000016d, 0x00000069
- dd 0x00000231, 0x00000335, 0x0000037d, 0x00000279
+dct_avx512: dd 0x00000000, 0x00021104, 0x0006314c, 0x00042048 ; bits 0-4: dct8x8_fenc
+ dd 0x00008a10, 0x00029b14, 0x0006bb5c, 0x0004aa58 ; bits 5-9: dct8x8_fdec
+ dd 0x00004421, 0x00025525, 0x0006756d, 0x00046469 ; bits 10-13: dct16x16_fenc
+ dd 0x0000ce31, 0x0002df35, 0x0006ff7d, 0x0004ee79 ; bits 14-18: dct16x16_fdec
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
@@ -699,6 +699,31 @@ cglobal sub8x8_dct, 3,3
mova [r0], m0
mova [r0+64], m1
RET
+
+%macro SUB4x16_DCT_AVX512 2 ; dst, src: %1 = output slot in r0 (64-byte units), %2 = input strip index (64 bytes per strip in r1, 128 bytes per strip in r2)
+ vpermd m1, m5, [r1+1*%2*64] ; m1 = dwords of the 64-byte r1 strip, reordered by the indices in m5
+ mova m3, [r2+2*%2*64] ; m3 = first 64 bytes of the 128-byte r2 strip
+ vpermt2d m3, m6, [r2+2*%2*64+64] ; pick dwords from m3 and the next 64 bytes of r2 using the indices in m6
+ call dct4x4x4_internal_avx512 ; defined outside this hunk; presumably takes m1/m3 and returns coefficients in m0/m1 -- TODO confirm
+ mova [r0+%1*64 ], m0 ; store m0 to output slot %1
+ mova [r0+%1*64+128], m1 ; store m1 128 bytes (two slots) after it
+%endmacro
+
+cglobal sub16x16_dct
+ psrld m5, [dct_avx512], 10 ; bring the table's "dct16x16_fenc" index field (bits 10-13) down to bit 0
+ mov eax, 0xaaaaaaaa
+ kmovd k1, eax ; k1 = every odd bit set; not used in this hunk, presumably read by dct4x4x4_internal_avx512 -- confirm
+ mov eax, 0xf0f0f0f0
+ kmovd k2, eax ; k2 = upper nibble of every byte set; same caveat as k1
+ PROLOGUE 3,3
+ pxor xm4, xm4 ; xm4 = 0; not used in this hunk, presumably a zero constant for the internal routine -- confirm
+ knotw k3, k2 ; k3 = complement of the low 16 bits of k2
+ psrld m6, m5, 4 ; four more bits of shift bring the "dct16x16_fdec" field (bits 14-18) down to bit 0
+ SUB4x16_DCT_AVX512 0, 0 ; input strip 0 -> output slots 0 and 2
+ SUB4x16_DCT_AVX512 1, 1 ; input strip 1 -> output slots 1 and 3
+ SUB4x16_DCT_AVX512 4, 2 ; input strip 2 -> output slots 4 and 6
+ SUB4x16_DCT_AVX512 5, 3 ; input strip 3 -> output slots 5 and 7 (8 slots * 64 bytes = the 512-byte dct[16][16])
+ RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index f605437a..c30b0daa 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -44,6 +44,7 @@ void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
More information about the x264-devel mailing list