[x265] [PATCH 042 of 307] x86:AVX512 ssd_s_32

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:40 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500629149 -19800
#      Fri Jul 21 14:55:49 2017 +0530
# Node ID 6b3b8ef0f37e0f7860f4f43c99e581674b19f9e3
# Parent  156acfb1bbb3cee56ed7b3337850a1fc9e4429ee
x86:AVX512 ssd_s_32

AVX2 performance   : 7.37x
AVX512 performance : 13.06x

diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 24 14:55:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jul 21 14:55:49 2017 +0530
@@ -3858,6 +3858,7 @@
 
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 
         p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Mon Jul 24 14:55:38 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Fri Jul 21 14:55:49 2017 +0530
@@ -3389,3 +3389,65 @@
     movd    eax, xm0
 %endif
     RET
+
+;-----------------------------------------------------------------------------
+; ssd_s avx512 code start
+;-----------------------------------------------------------------------------
+%macro PROCESS_SSD_S_32x8_AVX512 0     ; m0 += squared words of 8 rows starting at r0; clobbers m1-m4, r0
+    movu    m1, [r0]                    ; rows 0-3; expects r1 = row stride in bytes, r3 = 3 * r1 (set by the caller)
+    movu    m2, [r0 + r1]
+    movu    m3, [r0 + 2 * r1]
+    movu    m4, [r0 + r3]
+
+    pmaddwd m1, m1                      ; square each signed word and add adjacent pairs -> dword partial sums
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2                      ; pairwise reduction of the four rows
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1                      ; fold into the running per-lane accumulator
+
+    lea     r0, [r0 + 4 * r1]           ; advance to rows 4-7
+
+    movu    m1, [r0]                    ; rows 4-7, same sequence as above
+    movu    m2, [r0 + r1]
+    movu    m3, [r0 + 2 * r1]
+    movu    m4, [r0 + r3]
+
+    pmaddwd m1, m1
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1                      ; r0 still points at row 4 here: the caller must add 4 * r1 before the next use
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) -- sum of squared int16 values over 32 rows of 32 (pixel_ssd_s_32)
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_ssd_s_32, 2,4,5           ; x86inc: 2 args (r0 = ref, r1 = i_stride), 4 GPRs, 5 vector regs (m0-m4)
+    add     r1, r1                      ; presumably i_stride counts int16 elements; doubled to get a byte stride
+    lea     r3, [r1 * 3]                ; r3 = 3 * byte stride, used by the macro for the 4th row of each group
+    pxor    m0, m0                      ; m0 = dword accumulator, starts at zero
+
+    PROCESS_SSD_S_32x8_AVX512           ; rows 0-7
+    lea     r0, [r0 + 4 * r1]           ; macro only advanced r0 by 4 rows; skip the remaining 4
+    PROCESS_SSD_S_32x8_AVX512           ; rows 8-15
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_32x8_AVX512           ; rows 16-23
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_32x8_AVX512           ; rows 24-31
+
+    ; calculate sum and return
+    HADDD   m0, m1                      ; HADDD is defined outside this hunk; expected to sum m0's dword lanes (m1 = scratch)
+    movd    eax, xm0                    ; return value = low dword of m0
+    RET                                 ; NOTE(review): r2 is never used in this function although 4 GPRs are declared
+%endif                                  ; HIGH_BIT_DEPTH==0
+;-----------------------------------------------------------------------------
+; ssd_s avx512 code end
+;-----------------------------------------------------------------------------


More information about the x265-devel mailing list