[x264-devel] x86: AVX-512 zigzag_scan_8x8_frame
Henrik Gramner
git at videolan.org
Mon May 22 00:03:03 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Mar 25 19:14:28 2017 +0100| [724a577237f27cdb0c0fd18ef8ed32d39430796b] | committer: Henrik Gramner
x86: AVX-512 zigzag_scan_8x8_frame
The vperm* instructions ignore unused bits, so we can pack the permutation
indices together to save cache and just use a shift to get the right values.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=724a577237f27cdb0c0fd18ef8ed32d39430796b
---
common/common.h | 2 +-
common/dct.c | 2 ++
common/x86/dct-a.asm | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++--
common/x86/dct.h | 11 ++++++-----
encoder/macroblock.c | 4 ++--
encoder/macroblock.h | 2 +-
tools/checkasm.c | 2 +-
7 files changed, 66 insertions(+), 12 deletions(-)
diff --git a/common/common.h b/common/common.h
index e14dec7d..b49506c2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -638,7 +638,7 @@ struct x264_t
ALIGNED_64( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
- ALIGNED_32( dctcoef luma8x8[12][64] );
+ ALIGNED_64( dctcoef luma8x8[12][64] );
ALIGNED_64( dctcoef luma4x4[16*3][16] );
} dct;
diff --git a/common/dct.c b/common/dct.c
index 8ebb9ba5..5c1b8b5c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -989,6 +989,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
if( cpu&X264_CPU_AVX512 )
{
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#else
@@ -1033,6 +1034,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
if( cpu&X264_CPU_AVX512 )
{
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index ad457237..a9b853c4 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -31,10 +31,18 @@
%include "x86util.asm"
SECTION_RODATA 64
+; Permutation indices for AVX-512 zigzags are bit-packed to save cache
%if HIGH_BIT_DEPTH
-scan_frame_avx512: dd 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15
+scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame
+ dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1
+ dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2
+ dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
+ ; bits 19-23: 8x8_frame4
%else
-scan_frame_avx512: dw 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15
+scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
+ dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
+ dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
+ dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
%endif
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
@@ -1897,6 +1905,37 @@ cglobal zigzag_scan_4x4_frame, 2,2
vpermd m0, m0, [r1]
mova [r0], m0
RET
+
+cglobal zigzag_scan_8x8_frame, 2,2
+ psrld m0, [scan_frame_avx512], 4
+ mova m1, [r1+0*64]
+ mova m2, [r1+1*64]
+ mova m3, [r1+2*64]
+ mova m4, [r1+3*64]
+ mov r1d, 0x01fe7f80
+ kmovd k1, r1d
+ kshiftrd k2, k1, 16
+ vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
+ psrld m6, m0, 5
+ vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __
+ vmovdqa64 m0 {k1}, m5
+ mova [r0+0*64], m0
+ mova m5, m1
+ vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __
+ psrld m0, m6, 5
+ vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
+ vmovdqa32 m6 {k2}, m1
+ mova [r0+1*64], m6
+ vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30
+ psrld m1, m0, 5
+ vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
+ vmovdqa32 m5 {k1}, m0
+ mova [r0+2*64], m5
+ vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
+ vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
+ vmovdqa64 m2 {k2}, m3
+ mova [r0+3*64], m2
+ RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
@@ -1904,4 +1943,16 @@ cglobal zigzag_scan_4x4_frame, 2,2
vpermw m0, m0, [r1]
mova [r0], m0
RET
+
+INIT_ZMM avx512
+cglobal zigzag_scan_8x8_frame, 2,2
+ psrlw m0, [scan_frame_avx512], 4
+ mova m1, [r1]
+ mova m2, [r1+64]
+ psrlw m3, m0, 6
+ vpermi2w m0, m1, m2
+ vpermt2w m1, m3, m2
+ mova [r0], m0
+ mova [r0+64], m1
+ RET
%endif ; !HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index ce88c7e4..6254368b 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -101,11 +101,12 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
-void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 3684e257..af8462a7 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -780,7 +780,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else if( h->mb.b_transform_8x8 )
{
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -1219,7 +1219,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index db85539f..1c901a89 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
if( b_predict )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 02c84989..d09a06a4 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -842,7 +842,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
More information about the x264-devel
mailing list