[x264-devel] x86: AVX-512 pixel_satd
Henrik Gramner
git at videolan.org
Mon May 22 00:03:55 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Apr 13 23:56:04 2017 +0200 | [386050088a66aa66bcaebb9b6b4b0a2b6af76a73] | committer: Henrik Gramner
x86: AVX-512 pixel_satd
Covers all variants: 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=386050088a66aa66bcaebb9b6b4b0a2b6af76a73
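For readers skimming the asm below: SATD here is the sum of absolute values of a 4x4 Hadamard transform applied to the difference between source and reference pixels, halved at the end; the larger block sizes accumulate that transform over their sub-blocks, and the AVX-512 kernels evaluate several such transforms at once by packing sub-blocks into separate 128-bit lanes. A rough scalar sketch of the 8-bit 4x4 case follows -- an illustration of what the new kernels compute, not the x264 reference implementation (which uses a packed two-lanes-per-integer trick):

    #include <stdlib.h>

    /* Illustrative scalar SATD for one 8-bit 4x4 block: Hadamard-transform
     * the pixel differences, sum the absolute coefficients, halve. */
    static int satd_4x4_scalar( const unsigned char *pix1, int stride1,
                                const unsigned char *pix2, int stride2 )
    {
        int d[4][4], t[4][4], sum = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[y*stride1+x] - pix2[y*stride2+x];
        for( int y = 0; y < 4; y++ )
        {   /* horizontal 4-point Hadamard butterfly */
            int a0 = d[y][0]+d[y][1], a1 = d[y][0]-d[y][1];
            int a2 = d[y][2]+d[y][3], a3 = d[y][2]-d[y][3];
            t[y][0] = a0+a2; t[y][1] = a1+a3;
            t[y][2] = a0-a2; t[y][3] = a1-a3;
        }
        for( int x = 0; x < 4; x++ )
        {   /* vertical pass, then sum of absolute transformed differences */
            int a0 = t[0][x]+t[1][x], a1 = t[0][x]-t[1][x];
            int a2 = t[2][x]+t[3][x], a3 = t[2][x]-t[3][x];
            sum += abs(a0+a2) + abs(a1+a3) + abs(a0-a2) + abs(a1-a3);
        }
        return sum >> 1;
    }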
---
common/pixel.c | 5 ++
common/x86/pixel-a.asm | 206 +++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/pixel.h | 1 +
common/x86/x86util.asm | 26 +++----
4 files changed, 225 insertions(+), 13 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index c5edc9e9..b68bb4c2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1351,6 +1351,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ INIT8( satd, _avx512 );
+ }
#endif //HAVE_MMX
#if HAVE_ARMV6
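The single INIT8( satd, _avx512 ) call wires up all eight block sizes listed in the commit message. Roughly, it expands to the assignments below (a sketch based on the INIT* helper-macro chain in pixel.c, which this diff does not show; the function names follow from the DECL_X1( satd, avx512 ) declaration added to pixel.h later in this patch):

    pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_avx512;
    pixf->satd[PIXEL_16x8]  = x264_pixel_satd_16x8_avx512;
    pixf->satd[PIXEL_8x16]  = x264_pixel_satd_8x16_avx512;
    pixf->satd[PIXEL_8x8]   = x264_pixel_satd_8x8_avx512;
    pixf->satd[PIXEL_8x4]   = x264_pixel_satd_8x4_avx512;
    pixf->satd[PIXEL_4x16]  = x264_pixel_satd_4x16_avx512;
    pixf->satd[PIXEL_4x8]   = x264_pixel_satd_4x8_avx512;
    pixf->satd[PIXEL_4x4]   = x264_pixel_satd_4x4_avx512;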
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 42e0dd08..9c6ed6c8 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4583,6 +4583,212 @@ cglobal intra_sad_x9_8x8, 5,7,8
mov rsp, r6
mov eax, r2d
RET
+
+%macro SATD_AVX512_LOAD4 2 ; size, opmask
+ vpbroadcast%1 m0, [r0]
+ vpbroadcast%1 m0 {%2}, [r0+2*r1]
+ vpbroadcast%1 m2, [r2]
+ vpbroadcast%1 m2 {%2}, [r2+2*r3]
+ add r0, r1
+ add r2, r3
+ vpbroadcast%1 m1, [r0]
+ vpbroadcast%1 m1 {%2}, [r0+2*r1]
+ vpbroadcast%1 m3, [r2]
+ vpbroadcast%1 m3 {%2}, [r2+2*r3]
+%endmacro
+
+%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
+ vpbroadcast%1 %{2}0, [r0]
+ vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
+ vpbroadcast%1 %{2}2, [r2]
+ vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
+ vpbroadcast%1 m0 {%4}, [r0+4*r1]
+ vpbroadcast%1 m2 {%4}, [r2+4*r3]
+ vpbroadcast%1 m0 {%5}, [r0+2*r4]
+ vpbroadcast%1 m2 {%5}, [r2+2*r5]
+ vpbroadcast%1 %{2}1, [r0+r1]
+ vpbroadcast%1 %{2}1 {%3}, [r0+r4]
+ vpbroadcast%1 %{2}3, [r2+r3]
+ vpbroadcast%1 %{2}3 {%3}, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ vpbroadcast%1 m1 {%4}, [r0+r1]
+ vpbroadcast%1 m3 {%4}, [r2+r3]
+ vpbroadcast%1 m1 {%5}, [r0+r4]
+ vpbroadcast%1 m3 {%5}, [r2+r5]
+%endmacro
+
+%macro SATD_AVX512_PACKED 0
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+ SUMSUB_BA w, 0, 1, 2
+ SBUTTERFLY qdq, 0, 1, 2
+ SUMSUB_BA w, 0, 1, 2
+ HMAXABSW2 0, 1, 2, 3
+%endmacro
+
+%macro SATD_AVX512_END 0
+ paddw m0 {k1}{z}, m1 ; zero-extend to dwords
+%if ARCH_X86_64
+%if mmsize == 64
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+%endif
+%if mmsize >= 32
+ vextracti128 xm1, ym0, 1
+ paddd xmm0, xm0, xm1
+%endif
+ punpckhqdq xmm1, xmm0, xmm0
+ paddd xmm0, xmm1
+ movq rax, xmm0
+ rorx rdx, rax, 32
+ add eax, edx
+%else
+ HADDD m0, m1
+ movd eax, xm0
+%endif
+ RET
+%endmacro
+
+%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
+ pabsw m%1, m%1
+ pabsw m%2, m%2
+ psrldq m%3, m%1, 2
+ psrld m%4, m%2, 16
+ pmaxsw m%1, m%3
+ pmaxsw m%2, m%4
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_satd_16x8_internal
+ vbroadcasti64x4 m6, [hmul_16p]
+ kxnorb k2, k2, k2
+ mov r4d, 0x55555555
+ knotw k2, k2
+ kmovd k1, r4d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_16x8_avx512:
+ vbroadcasti128 ym0, [r0]
+ vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
+ vbroadcasti128 ym4, [r2]
+ vbroadcasti32x4 m4 {k2}, [r2+4*r3]
+ vbroadcasti128 ym2, [r0+2*r1]
+ vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
+ vbroadcasti128 ym5, [r2+2*r3]
+ vbroadcasti32x4 m5 {k2}, [r2+2*r5]
+ DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
+ vbroadcasti128 ym1, [r0+r1]
+ vbroadcasti128 ym4, [r2+r3]
+ vbroadcasti128 ym3, [r0+r4]
+ vbroadcasti128 ym5, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
+ vbroadcasti32x4 m4 {k2}, [r2+r3]
+ vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
+ vbroadcasti32x4 m5 {k2}, [r2+r5]
+ DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
+ HADAMARD4_V 0, 1, 2, 3, 4
+ HMAXABSW2 0, 2, 4, 5
+ HMAXABSW2 1, 3, 4, 5
+ paddw m4, m0, m2 ; m1
+ paddw m2, m1, m3 ; m0
+ ret
+
+cglobal pixel_satd_8x8_internal
+ vbroadcasti64x4 m4, [hmul_16p]
+ mov r4d, 0x55555555
+ kmovd k1, r4d ; 01010101
+ kshiftlb k2, k1, 5 ; 10100000
+ kshiftlb k3, k1, 4 ; 01010000
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_8x8_avx512:
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
+ SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
+ ret
+
+cglobal pixel_satd_16x8, 4,6
+ call pixel_satd_16x8_internal_avx512
+ jmp satd_zmm_avx512_end
+
+cglobal pixel_satd_16x16, 4,6
+ call pixel_satd_16x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m7, m0, m1
+ call satd_16x8_avx512
+ paddw m1, m7
+ jmp satd_zmm_avx512_end
+
+cglobal pixel_satd_8x8, 4,6
+ call pixel_satd_8x8_internal_avx512
+satd_zmm_avx512_end:
+ SATD_AVX512_END
+
+cglobal pixel_satd_8x16, 4,6
+ call pixel_satd_8x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m5, m0, m1
+ call satd_8x8_avx512
+ paddw m1, m5
+ jmp satd_zmm_avx512_end
+
+INIT_YMM avx512
+cglobal pixel_satd_4x8_internal
+ vbroadcasti128 m4, [hmul_4p]
+ mov r4d, 0x55550c
+ kmovd k2, r4d ; 00001100
+ kshiftlb k3, k2, 2 ; 00110000
+ kshiftlb k4, k2, 4 ; 11000000
+ kshiftrd k1, k2, 8 ; 01010101
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_4x8_avx512:
+ SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
+satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
+ SATD_AVX512_PACKED
+ ret
+
+cglobal pixel_satd_8x4, 4,5
+ mova m4, [hmul_16p]
+ mov r4d, 0x5555
+ kmovw k1, r4d
+ SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
+ call satd_ymm_avx512 ; 3 1 3 1
+ jmp satd_ymm_avx512_end2
+
+cglobal pixel_satd_4x8, 4,6
+ call pixel_satd_4x8_internal_avx512
+satd_ymm_avx512_end:
+%if ARCH_X86_64 == 0
+ pop r5d
+ %assign regs_used 5
+%endif
+satd_ymm_avx512_end2:
+ SATD_AVX512_END
+
+cglobal pixel_satd_4x16, 4,6
+ call pixel_satd_4x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m5, m0, m1
+ call satd_4x8_avx512
+ paddw m1, m5
+ jmp satd_ymm_avx512_end
+
+INIT_XMM avx512
+cglobal pixel_satd_4x4, 4,5
+ mova m4, [hmul_4p]
+ mov r4d, 0x550c
+ kmovw k2, r4d
+ kshiftrw k1, k2, 8
+ SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
+ SATD_AVX512_PACKED ; 1 1 3 3
+ SWAP 0, 1
+ SATD_AVX512_END
+
%endif ; HIGH_BIT_DEPTH
;=============================================================================
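The SATD_AVX512_LOAD4/LOAD8 macros and the 16x8 loop above rely on opmask-predicated broadcasts: rows belonging to different 4x4/8x4 sub-blocks are merged into separate lanes of one ymm/zmm register, so a single Hadamard pass covers what previously needed several SSE/AVX2 passes. A minimal intrinsics sketch of that merge pattern (hypothetical helper, AVX-512F+VL assumed; the lane layout is illustrative, not the exact layout the asm uses):

    #include <immintrin.h>
    #include <stddef.h>
    #include <string.h>

    /* Pack two 8-pixel rows into one ymm: broadcast row 0 to every 64-bit
     * lane, then use a masked broadcast to overwrite lanes 1 and 3 with
     * row 1 -- the same merge trick as SATD_AVX512_LOAD4/LOAD8. */
    static __m256i pack_two_rows( const unsigned char *p, ptrdiff_t stride )
    {
        long long row0;
        memcpy( &row0, p, 8 );                                 /* row 0 */
        __m128i row1 = _mm_loadl_epi64( (const __m128i *)(p + stride) );
        __m256i v = _mm256_set1_epi64x( row0 );                /* row 0 in all lanes */
        return _mm256_mask_broadcastq_epi64( v, 0x0A, row1 );  /* row 1 -> lanes 1,3 */
    }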
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 2b0baa34..62b9fb42 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -75,6 +75,7 @@ DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( satd, avx2 )
+DECL_X1( satd, avx512 )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index ea40bc87..7a140ebd 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -303,24 +303,24 @@
%endmacro
%macro HADDD 2 ; sum junk
-%if sizeof%1 == 32
-%define %2 xmm%2
- vextracti128 %2, %1, 1
-%define %1 xmm%1
- paddd %1, %2
+%if sizeof%1 >= 64
+ vextracti32x8 ymm%2, zmm%1, 1
+ paddd ymm%1, ymm%2
%endif
-%if mmsize >= 16
- MOVHL %2, %1
- paddd %1, %2
+%if sizeof%1 >= 32
+ vextracti128 xmm%2, ymm%1, 1
+ paddd xmm%1, xmm%2
+%endif
+%if sizeof%1 >= 16
+ MOVHL xmm%2, xmm%1
+ paddd xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
- vphadddq %1, %1
+ vphadddq xmm%1, xmm%1
%else
- PSHUFLW %2, %1, q0032
- paddd %1, %2
+ PSHUFLW xmm%2, xmm%1, q1032
+ paddd xmm%1, xmm%2
%endif
-%undef %1
-%undef %2
%endmacro
%macro HADDW 2 ; reg, tmp
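The HADDD rework above extends the same horizontal dword reduction to 512-bit inputs and addresses the sub-registers (xmm%1/ymm%1/zmm%1) directly instead of redefining the macro arguments. What the macro computes is simply a horizontal sum; a scalar model for reference (illustrative only):

    /* Scalar equivalent of HADDD: sum every 32-bit lane of a vector,
     * where lanes is 4, 8 or 16 for xmm, ymm or zmm inputs. */
    static int haddd_scalar( const int *v, int lanes )
    {
        int sum = 0;
        for( int i = 0; i < lanes; i++ )
            sum += v[i];
        return sum;
    }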