[x264-devel] x86: AVX-512 pixel_sa8d_8x8
Henrik Gramner
git at videolan.org
Mon May 22 00:04:00 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri Apr 28 21:35:25 2017 +0200| [1cf7baa462ca52de7f07d6e4c795853900bb50bb] | committer: Henrik Gramner
x86: AVX-512 pixel_sa8d_8x8
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1cf7baa462ca52de7f07d6e4c795853900bb50bb
---
common/pixel.c | 1 +
common/x86/pixel-a.asm | 34 +++++++++++++++++++++++++++++++++-
common/x86/pixel.h | 1 +
3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/common/pixel.c b/common/pixel.c
index b68bb4c2..c33a873f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1355,6 +1355,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX512 )
{
INIT8( satd, _avx512 );
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
}
#endif //HAVE_MMX
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9c6ed6c8..9b3dc27b 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4626,7 +4626,7 @@ cglobal intra_sad_x9_8x8, 5,7,8
HMAXABSW2 0, 1, 2, 3
%endmacro
-%macro SATD_AVX512_END 0
+%macro SATD_AVX512_END 0-1 0 ; sa8d
paddw m0 {k1}{z}, m1 ; zero-extend to dwords
%if ARCH_X86_64
%if mmsize == 64
@@ -4641,10 +4641,19 @@ cglobal intra_sad_x9_8x8, 5,7,8
paddd xmm0, xmm1
movq rax, xmm0
rorx rdx, rax, 32
+%if %1
+ lea eax, [rax+rdx+1]
+ shr eax, 1
+%else
add eax, edx
+%endif
%else
HADDD m0, m1
movd eax, xm0
+%if %1
+ inc eax
+ shr eax, 1
+%endif
%endif
RET
%endmacro
@@ -4789,6 +4798,29 @@ cglobal pixel_satd_4x4, 4,5
SWAP 0, 1
SATD_AVX512_END
+INIT_ZMM avx512 ; presumably selects 64-byte (ZMM) registers + the _avx512 name suffix for what follows -- confirm in x86inc
+cglobal pixel_sa8d_8x8, 4,6 ; AVX-512 sa8d for one 8x8 block; installed as pixf->sa8d[PIXEL_8x8] in pixel.c. r0..r3 are presumably pix1, stride1, pix2, stride2 -- TODO confirm against the C prototype
+ vbroadcasti64x4 m4, [hmul_16p] ; replicate the 256-bit hmul_16p constant into both halves of m4 (consumed by DIFF_SUMSUB_SSSE3 below)
+ mov r4d, 0x55555555 ; alternating-bit pattern; r4 is only a temporary here and is overwritten by the lea below
+ kmovd k1, r4d ; 01010101
+ kshiftlb k2, k1, 5 ; 10100000
+ kshiftlb k3, k1, 4 ; 01010000
+ lea r4, [3*r1] ; r4 = 3*r1 (presumably 3x the first stride, for addressing rows 3 and 7 -- confirm in SATD_AVX512_LOAD8)
+ lea r5, [3*r3] ; r5 = 3*r3 (presumably 3x the second stride -- confirm in SATD_AVX512_LOAD8)
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
+ SUMSUB_BA w, 0, 1, 2 ; word-sized sum/difference of m0 and m1, m2 as scratch (presumably one Hadamard butterfly stage)
+ SBUTTERFLY qdq, 0, 1, 2 ; interleave qwords of m0/m1 so the next butterfly pairs different elements
+ SUMSUB_BA w, 0, 1, 2 ; next sum/difference stage on the re-paired data
+ shufps m2, m0, m1, q2020 ; m2 = even-indexed dwords of m0 and m1 (per 128-bit lane)
+ shufps m1, m0, m1, q3131 ; m1 = odd-indexed dwords of m0 and m1 (per 128-bit lane)
+ SUMSUB_BA w, 2, 1, 0 ; sum/difference of the even/odd dword groups, m0 as scratch
+ vshufi32x4 m0, m2, m1, q1010 ; m0 = 128-bit lanes 0,1 of m2 followed by lanes 0,1 of m1
+ vshufi32x4 m1, m2, m1, q3232 ; m1 = 128-bit lanes 2,3 of m2 followed by lanes 2,3 of m1
+ SUMSUB_BA w, 0, 1, 2 ; sum/difference across the regrouped 128-bit lanes
+ HMAXABSW2 0, 1, 2, 3 ; same helper invocation as in the macro above this hunk; presumably folds the last stage into abs/max of word pairs -- see its definition
+ SATD_AVX512_END 1 ; argument 1 selects the sa8d branch of the macro: after the horizontal sum the result is (sum+1)>>1, returned in eax
+
%endif ; HIGH_BIT_DEPTH
;=============================================================================
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 62b9fb42..d7753f53 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -84,6 +84,7 @@ DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sa8d, avx2 )
+DECL_X1( sa8d, avx512 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
More information about the x264-devel mailing list