[x264-devel] x86: AVX-512 pixel_sad_x3 and pixel_sad_x4
Henrik Gramner
git at videolan.org
Mon May 22 00:04:24 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed May 10 18:36:59 2017 +0200| [4579616543f2e701ee9510f5eb57e31a3ef99e10] | committer: Henrik Gramner
x86: AVX-512 pixel_sad_x3 and pixel_sad_x4
Covers all variants: 4x4, 4x8, 8x4, 8x8, 8x16, 16x8, and 16x16.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=4579616543f2e701ee9510f5eb57e31a3ef99e10
---
common/pixel.c | 2 +
common/x86/pixel.h | 1 +
common/x86/sad-a.asm | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 222 insertions(+)
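The sad_x3/sad_x4 primitives score one encode block against three or four candidate reference blocks that share a single stride and write the results to a scores array (the asm entry points are declared via the DECL_X4( sad, avx512 ) line added to common/x86/pixel.h below). As a reference for what the new kernels compute, here is a scalar sketch of the 3-way case; the function name and the explicit width/height parameters are illustrative only (the real kernels bake the block size into the symbol name), and it assumes x264's fixed FENC_STRIDE for the encode plane:

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16  /* x264's fixed encode-plane stride (8-bit builds) */

    /* Scalar reference of a sad_x3 kernel: the AVX-512 code in this patch
     * produces the same three sums for its particular block size. */
    static void sad_x3_ref( const uint8_t *fenc, const uint8_t *pix0,
                            const uint8_t *pix1, const uint8_t *pix2,
                            intptr_t i_stride, int scores[3],
                            int width, int height )
    {
        scores[0] = scores[1] = scores[2] = 0;
        for( int y = 0; y < height; y++ )
        {
            for( int x = 0; x < width; x++ )
            {
                scores[0] += abs( fenc[x] - pix0[x] );
                scores[1] += abs( fenc[x] - pix1[x] );
                scores[2] += abs( fenc[x] - pix2[x] );
            }
            fenc += FENC_STRIDE;
            pix0 += i_stride;
            pix1 += i_stride;
            pix2 += i_stride;
        }
    }

The x4 variant is identical except for a fourth reference pointer and a scores[4] output.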
diff --git a/common/pixel.c b/common/pixel.c
index 23e9c7a1..14891d71 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1350,6 +1350,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
INIT8( sad, _avx512 );
INIT8_NAME( sad_aligned, sad, _avx512 );
+ INIT7( sad_x3, _avx512 );
+ INIT7( sad_x4, _avx512 );
INIT8( satd, _avx512 );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 9bf71134..56cfc5cb 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -60,6 +60,7 @@ DECL_X4( sad, ssse3 )
DECL_X4( sad, xop )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
+DECL_X4( sad, avx512 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 6b248a4d..8029e113 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1617,6 +1617,225 @@ SAD_X_AVX2 3, 16, 8, 7
SAD_X_AVX2 4, 16, 16, 8
SAD_X_AVX2 4, 16, 8, 8
+%macro SAD_X_W4_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
+ mov t1d, 0xa
+ kmovb k1, t1d
+ lea t1, [3*t0]
+ kaddb k2, k1, k1
+ kshiftlb k3, k1, 2
+%assign %%i 0
+%rep %2/4
+ movu m6, [r0+%%i*64]
+ vmovddup m6 {k1}, [r0+%%i*64+32]
+ movd xmm2, [r1]
+ movd xmm4, [r1+t0]
+ vpbroadcastd xmm2 {k1}, [r1+2*t0]
+ vpbroadcastd xmm4 {k1}, [r1+t1]
+ vpbroadcastd xmm2 {k2}, [r2+t0]
+ vpbroadcastd xmm4 {k2}, [r2]
+ vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
+ vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
+ vpmovqd s1, m6 ; s0 s2 s1 s3
+ movd xmm3, [r3]
+ movd xmm5, [r3+t0]
+ vpbroadcastd xmm3 {k1}, [r3+2*t0]
+ vpbroadcastd xmm5 {k1}, [r3+t1]
+%if %1 == 4
+ vpbroadcastd xmm3 {k2}, [r4+t0]
+ vpbroadcastd xmm5 {k2}, [r4]
+ vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
+ vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ pshufd s2, s1, q1032
+ psadbw xmm2, s1
+ psadbw xmm4, s2
+ psadbw xmm3, s1
+ psadbw xmm5, s2
+%if %%i
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+%else
+ paddd xmm0, xmm2, xmm4
+ paddd xmm1, xmm3, xmm5
+%endif
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ packusdw xmm0, xmm1
+ mova [t2], xmm0
+ RET
+%endmacro
+
+%macro SAD_X_W8_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
+ kxnorb k3, k3, k3
+ lea t1, [3*t0]
+ kaddb k1, k3, k3
+ kshiftlb k2, k3, 2
+ kshiftlb k3, k3, 3
+%assign %%i 0
+%rep %2/4
+ movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
+ movq xm2, [r1]
+ movq xm4, [r1+2*t0]
+ vpbroadcastq xm2 {k1}, [r2]
+ vpbroadcastq xm4 {k1}, [r2+2*t0]
+ vpbroadcastq m2 {k2}, [r1+t0]
+ vpbroadcastq m4 {k2}, [r1+t1]
+ vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
+ vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
+ movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
+ movq xm3, [r3]
+ movq xm5, [r3+2*t0]
+%if %1 == 4
+ vpbroadcastq xm3 {k1}, [r4]
+ vpbroadcastq xm5 {k1}, [r4+2*t0]
+%endif
+ vpbroadcastq m3 {k2}, [r3+t0]
+ vpbroadcastq m5 {k2}, [r3+t1]
+%if %1 == 4
+ vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
+ vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ psadbw m2, m6
+ psadbw m4, m7
+ psadbw m3, m6
+ psadbw m5, m7
+ ACCUM paddd, 0, 2, %%i
+ ACCUM paddd, 1, 3, %%i
+ paddd m0, m4
+ paddd m1, m5
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ packusdw m0, m1
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ mova [t2], xm0
+ RET
+%endmacro
+
+%macro SAD_X_W16_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
+ lea t1, [3*t0]
+%assign %%i 0
+%rep %2/4
+ mova m6, [r0+%%i*64] ; s0 s1 s2 s3
+ movu xm2, [r3]
+ movu xm4, [r3+t0]
+%if %1 == 4
+ vinserti128 ym2, [r4+t0], 1
+ vinserti128 ym4, [r4], 1
+%endif
+ vinserti32x4 m2, [r1+2*t0], 2
+ vinserti32x4 m4, [r1+t1], 2
+ vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
+ vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
+ vpermq m7, m6, q1032 ; s1 s0 s3 s2
+ movu xm3, [r1]
+ movu xm5, [r1+t0]
+ vinserti128 ym3, [r2+t0], 1
+ vinserti128 ym5, [r2], 1
+ vinserti32x4 m3, [r3+2*t0], 2
+ vinserti32x4 m5, [r3+t1], 2
+%if %1 == 4
+ vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
+ vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ psadbw m2, m6
+ psadbw m4, m7
+ psadbw m3, m6
+ psadbw m5, m7
+ ACCUM paddd, 0, 2, %%i
+ ACCUM paddd, 1, 3, %%i
+ paddd m0, m4
+ paddd m1, m5
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ mov t1d, 0x1111
+ kmovw k1, t1d
+ vshufi32x4 m0, m0, q1032
+ paddd m0, m1
+ punpckhqdq m1, m0, m0
+ paddd m0, m1
+ vpcompressd m0 {k1}{z}, m0
+ mova [t2], xm0
+ RET
+%endmacro
+
+; t0 = stride, t1 = tmp/stride3, t2 = scores
+%if WIN64
+ %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but
+ %define s2 xmm17 ; they're callee-saved on win64
+ DECLARE_REG_TMP 4, 6, 0
+%else
+ %define s1 xmm6
+ %define s2 xmm7
+%if ARCH_X86_64
+ DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
+%else
+ DECLARE_REG_TMP 4, 5, 0
+%endif
+%endif
+
+INIT_YMM avx512
+SAD_X_W4_AVX512 3, 4 ; x3_4x4
+SAD_X_W4_AVX512 3, 8 ; x3_4x8
+SAD_X_W8_AVX512 3, 4 ; x3_8x4
+SAD_X_W8_AVX512 3, 8 ; x3_8x8
+SAD_X_W8_AVX512 3, 16 ; x3_8x16
+INIT_ZMM avx512
+SAD_X_W16_AVX512 3, 8 ; x3_16x8
+SAD_X_W16_AVX512 3, 16 ; x3_16x16
+
+DECLARE_REG_TMP 5, 6, 0
+INIT_YMM avx512
+SAD_X_W4_AVX512 4, 4 ; x4_4x4
+SAD_X_W4_AVX512 4, 8 ; x4_4x8
+SAD_X_W8_AVX512 4, 4 ; x4_8x4
+SAD_X_W8_AVX512 4, 8 ; x4_8x8
+SAD_X_W8_AVX512 4, 16 ; x4_8x16
+INIT_ZMM avx512
+SAD_X_W16_AVX512 4, 8 ; x4_16x8
+SAD_X_W16_AVX512 4, 16 ; x4_16x16
+
;=============================================================================
; SAD cacheline split
;=============================================================================
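To illustrate the approach used by the 16-wide kernels above: each psadbw compares a 64-byte register holding one row from each candidate (one per 128-bit lane) against the matching fenc rows, so the accumulators end up holding per-lane partial SADs that the final vshufi32x4/punpckhqdq sequence folds into one score per candidate. The intrinsics sketch below shows only that basic idea with a straight one-candidate-per-lane layout and a hypothetical name; the actual asm processes four rows per iteration and interleaves rows across lanes and across two accumulators to save loads and shuffles:

    #include <immintrin.h>
    #include <stdint.h>

    /* Simplified AVX-512 sketch of a sad_x4_16xH kernel (hypothetical name,
     * not the exact lane layout the asm uses): one candidate per 128-bit lane. */
    static void sad_x4_16xH_sketch( const uint8_t *fenc, const uint8_t *pix0,
                                    const uint8_t *pix1, const uint8_t *pix2,
                                    const uint8_t *pix3, intptr_t i_stride,
                                    int scores[4], int height )
    {
        __m512i acc = _mm512_setzero_si512();
        for( int y = 0; y < height; y++ )
        {
            /* one 16-byte row from each candidate, one candidate per lane */
            __m512i ref = _mm512_castsi128_si512( _mm_loadu_si128( (const __m128i*)pix0 ) );
            ref = _mm512_inserti32x4( ref, _mm_loadu_si128( (const __m128i*)pix1 ), 1 );
            ref = _mm512_inserti32x4( ref, _mm_loadu_si128( (const __m128i*)pix2 ), 2 );
            ref = _mm512_inserti32x4( ref, _mm_loadu_si128( (const __m128i*)pix3 ), 3 );
            /* the matching fenc row broadcast to all four lanes */
            __m512i src = _mm512_broadcast_i32x4( _mm_loadu_si128( (const __m128i*)fenc ) );
            /* psadbw yields two 64-bit partial sums per lane; accumulate them */
            acc = _mm512_add_epi64( acc, _mm512_sad_epu8( ref, src ) );
            fenc += 16;  /* FENC_STRIDE */
            pix0 += i_stride;
            pix1 += i_stride;
            pix2 += i_stride;
            pix3 += i_stride;
        }
        uint64_t q[8];
        _mm512_storeu_si512( q, acc );
        for( int k = 0; k < 4; k++ )
            scores[k] = (int)( q[2*k] + q[2*k+1] );  /* fold the two halves of each lane */
    }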