[x264-devel] x86: AVX-512 plane_copy_deinterleave_v210
Henrik Gramner
git at videolan.org
Mon May 22 00:03:47 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Apr 12 16:21:09 2017 +0200| [3081ffa1c540d1df05123e0fab1937985573ac78] | committer: Henrik Gramner
x86: AVX-512 plane_copy_deinterleave_v210
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3081ffa1c540d1df05123e0fab1937985573ac78
---
common/frame.c | 2 +-
common/x86/mc-a2.asm | 110 +++++++++++++++++++++++++++++++--------------------
common/x86/mc-c.c | 26 +++++++-----
tools/checkasm.c | 2 +-
4 files changed, 87 insertions(+), 53 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index a81e9b10..4d80cbb0 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -78,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_padv = PADV << PARAM_INTERLACED;
int align = 16;
#if ARCH_X86 || ARCH_X86_64
- if( h->param.cpu&X264_CPU_CACHELINE_64 )
+ if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
align = 64;
else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
align = 32;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 45692ff5..5bf452ba 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -30,18 +30,15 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
-
-pw_1024: times 16 dw 1024
-filt_mul20: times 32 db 20
-filt_mul15: times 16 db 1, -5
-filt_mul51: times 16 db -5, 1
-hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+SECTION_RODATA 64
%if HIGH_BIT_DEPTH
-v210_mask: times 4 dq 0xc00ffc003ff003ff
-v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
-v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
+v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
+ db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
+ db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
+v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
+v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
+v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
@@ -58,6 +55,12 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
+pw_1024: times 16 dw 1024
+filt_mul20: times 32 db 20
+filt_mul15: times 16 db 1, -5
+filt_mul51: times 16 db -5, 1
+hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@@ -1400,43 +1403,64 @@ cglobal plane_copy_deinterleave_v210, 7,7,7
%define org_w r6m
%define h dword r7m
%endif
- FIX_STRIDES r1, r3, r6d
- shl r5, 2
- add r0, r6
- add r2, r6
- neg r6
- mov src, r4
- mov org_w, r6
- mova m2, [v210_mask]
- mova m3, [v210_luma_shuf]
- mova m4, [v210_chroma_shuf]
- mova m5, [v210_mult] ; also functions as vpermd index for avx2
- pshufd m6, m5, q1102
-
+ FIX_STRIDES r1, r3, r6d
+ shl r5, 2
+ add r0, r6
+ add r2, r6
+ neg r6
+ mov src, r4
+ mov org_w, r6
+%if cpuflag(avx512)
+ vpbroadcastd m2, [v210_mask]
+ vpbroadcastd m3, [v210_shuf_avx512]
+ psrlw m3, 6 ; dw 0, 4
+ mova m4, [v210_shuf_avx512] ; luma
+ psrlw m5, m4, 8 ; chroma
+%else
+%if mmsize == 32
+ vbroadcasti128 m2, [v210_mask]
+ vbroadcasti128 m3, [v210_luma_shuf]
+ vbroadcasti128 m4, [v210_chroma_shuf]
+%else
+ mova m2, [v210_mask]
+ mova m3, [v210_luma_shuf]
+ mova m4, [v210_chroma_shuf]
+%endif
+ mova m5, [v210_mult] ; also functions as vpermd index for avx2
+ pshufd m6, m5, q1102
+%endif
ALIGN 16
.loop:
- movu m1, [r4]
- pandn m0, m2, m1
- pand m1, m2
- pshufb m0, m3
- pshufb m1, m4
- pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
- pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
+ movu m1, [r4]
+ pandn m0, m2, m1
+ pand m1, m2
+%if cpuflag(avx512)
+ psrld m0, 10
+ vpsrlvw m1, m3
+ mova m6, m0
+ vpermt2w m0, m4, m1
+ vpermt2w m1, m5, m6
+%else
+ pshufb m0, m3
+ pshufb m1, m4
+ pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
- vpermd m0, m5, m0
- vpermd m1, m5, m1
+ vpermd m0, m5, m0
+ vpermd m1, m5, m1
%endif
- movu [r0+r6], m0
- movu [r2+r6], m1
- add r4, mmsize
- add r6, 3*mmsize/4
+%endif
+ movu [r0+r6], m0
+ movu [r2+r6], m1
+ add r4, mmsize
+ add r6, mmsize*3/4
jl .loop
- add r0, r1
- add r2, r3
- add src, r5
- mov r4, src
- mov r6, org_w
- dec h
+ add r0, r1
+ add r2, r3
+ add src, r5
+ mov r4, src
+ mov r6, org_w
+ dec h
jg .loop
RET
%endmacro ; PLANE_DEINTERLEAVE_V210
@@ -1461,6 +1485,8 @@ PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
+INIT_ZMM avx512
+PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index b7f508a2..a89a139d 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -123,15 +123,18 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@@ -689,6 +692,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
+ }
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4c84ade9..d963f774 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1605,7 +1605,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
- intptr_t dst_stride = ALIGN( w, 16 );
+ intptr_t dst_stride = ALIGN( w, 32 );
intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
intptr_t offv = dst_stride*h + 32;
memset( pbuf3, 0, 0x1000 );
More information about the x264-devel mailing list