[x264-devel] x86: AVX-512 zigzag_interleave_8x8_cavlc
Henrik Gramner
git at videolan.org
Mon May 22 00:03:16 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Mar 26 18:29:37 2017 +0200| [c3a1d1d892a79bc460c7fc192b0bf7a32c2ce0b2] | committer: Henrik Gramner
x86: AVX-512 zigzag_interleave_8x8_cavlc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c3a1d1d892a79bc460c7fc192b0bf7a32c2ce0b2
---
common/dct.c | 10 ++++++++
common/x86/dct-a.asm | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/dct.h | 9 ++++----
3 files changed, 80 insertions(+), 4 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index cd263c92..0452dcd0 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -1082,6 +1082,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
+ }
#else
if( cpu&X264_CPU_MMX )
{
@@ -1105,6 +1110,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
+ }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index ec696f49..412bade6 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -42,6 +42,10 @@ scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4:
dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2
dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
+cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1
+ dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2
+ dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
+ dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
@@ -51,6 +55,10 @@ scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x
dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
+cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1
+ dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
+ dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
+ dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
@@ -1974,6 +1982,43 @@ cglobal zigzag_scan_8x8_field, 2,2
mova [r0+2*64], m2
mova [r0+3*64], m3
RET
+
+cglobal zigzag_interleave_8x8_cavlc, 3,3 ; high bit depth variant; r0 = dst, r1 = src, r2 = nnz (argument order per the dct.h prototype)
+ mova m0, [cavlc_shuf_avx512] ; packed dword permute indices, four 5-bit fields per table entry
+ mova m1, [r1+0*64] ; load 4 x 64 bytes of src
+ mova m2, [r1+1*64]
+ mova m3, [r1+2*64]
+ mova m4, [r1+3*64]
+ kxnorb k1, k1, k1 ; k1 = 0xff (byte-sized op clears the upper mask bits): selects the low 8 dwords
+ por m7, m1, m2 ; begin OR-ing all input together for the nnz flags
+ psrld m5, m0, 5 ; bring index field "interleave2" (bits 5-9) down to bit 0
+ vpermi2d m0, m1, m2 ; a0 a1 b0 b1
+ vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4
+ psrld m6, m5, 5 ; bring index field "interleave3" (bits 10-14) down to bit 0
+ vpermi2d m5, m3, m4 ; b2 b3 a2 a3
+ vptestmd k0, m7, m7 ; k0 bit i is set when dword i of the OR-ed input is non-zero
+ vpermt2d m1, m6, m2 ; c0 c1 d0 d1
+ psrld m6, 5 ; bring index field "interleave4" (bits 15-19) down to bit 0
+ vpermt2d m3, m6, m4 ; d2 d3 c2 c3
+ vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
+ vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3
+ vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
+ vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3
+ mova [r0+0*64], m5 ; store the four de-interleaved 64-byte groups (a, b, c, d) to dst
+ mova [r0+1*64], m2
+ mova [r0+2*64], m3
+ mova [r0+3*64], m4
+ kmovw r1d, k0 ; src pointer is dead from here on, reuse r1 for the non-zero mask
+ test r1d, 0x1111 ; dword positions 0,4,8,12 -> the coefficients gathered into group a
+ setnz [r2] ; nnz[0] = group a has a non-zero coefficient
+ test r1d, 0x2222 ; dword positions 1,5,9,13 -> group b
+ setnz [r2+1] ; nnz[1]
+ test r1d, 0x4444 ; dword positions 2,6,10,14 -> group c
+ setnz [r2+8] ; nnz[8]
+ test r1d, 0x8888 ; dword positions 3,7,11,15 -> group d
+ setnz [r2+9] ; nnz[9]
+ RET
+
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
@@ -2005,4 +2050,24 @@ scan8_avx512:
cglobal zigzag_scan_8x8_field, 2,2
mova m0, [scan_field_avx512]
jmp scan8_avx512
+
+cglobal zigzag_interleave_8x8_cavlc, 3,3 ; 8-bit depth variant; r0 = dst, r1 = src, r2 = nnz (argument order per the dct.h prototype)
+ mova m0, [cavlc_shuf_avx512] ; packed word permute indices, two 6-bit fields per table entry
+ mova m1, [r1] ; first 64 bytes of src
+ mova m2, [r1+64] ; second 64 bytes of src
+ psrlw m3, m0, 6 ; bring index field "interleave2" (bits 6-11) down to bit 0
+ vpermi2w m0, m1, m2 ; m0 = words picked from m1:m2 by the "interleave1" indices
+ vpermt2w m1, m3, m2 ; m1 = words picked from m1:m2 by the "interleave2" indices
+ kxnorb k2, k2, k2 ; k2 = 0xff (byte-sized op clears the upper mask bits)
+ vptestmd k0, m0, m0 ; k0 bit i is set when dword i of the first output vector is non-zero
+ vptestmd k1, m1, m1 ; same for the second output vector
+ mova [r0], m0 ; store both permuted vectors to dst
+ mova [r0+64], m1
+ ktestw k2, k0 ; ZF = ((k0 & 0x00ff) == 0), CF = ((k0 & 0xff00) == 0)
+ setnz [r2] ; nnz[0] = low 8 dwords of the first vector contain a non-zero value
+ setnc [r2+1] ; nnz[1] = high 8 dwords of the first vector contain a non-zero value (setcc leaves flags intact)
+ ktestw k2, k1 ; same test on the second output vector
+ setnz [r2+8] ; nnz[8] = low 8 dwords of the second vector contain a non-zero value
+ setnc [r2+9] ; nnz[9] = high 8 dwords of the second vector contain a non-zero value
+ RET
%endif ; !HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index a11a6dcd..206fbeea 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -129,9 +129,10 @@ int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, u
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
More information about the x264-devel mailing list.