[x265] [PATCH 050 of 307] x86: AVX512 ssd_s_16

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:48 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500967696 -19800
#      Tue Jul 25 12:58:16 2017 +0530
# Node ID 09159f73f47b7eda15c8d0294774fe6eafdadea7
# Parent  a75dd880817adddafac5e1105e512ea79c7a089b
x86: AVX512 ssd_s_16
This patch also reworks ssd_s_32 to support high bit depth

ssd_s_16
AVX2 performance   : 14.11x
AVX512 performance : 16.14x

ssd_s_32 for high bit depth
AVX2 performance   : 14.78x
AVX512 performance : 20.54x

diff -r a75dd880817a -r 09159f73f47b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 26 10:04:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 12:58:16 2017 +0530
@@ -2249,6 +2249,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
 
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -3919,6 +3921,7 @@
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
 
         p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
diff -r a75dd880817a -r 09159f73f47b source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Wed Jul 26 10:04:24 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Tue Jul 25 12:58:16 2017 +0530
@@ -3425,10 +3425,28 @@
     paddd   m0, m1
 %endmacro
 
+%macro PROCESS_SSD_S_16x8_AVX512 0             ; m0 += per-dword sums of squares of 8 rows x 16 words at r0; expects r1 = row stride in bytes, r3 = 3 * r1
+    movu             ym1,   [r0]                ; row 0 (256 bits = 16 words) -> low half of m1
+    vinserti32x8     m1,    [r0 + r1],     1    ; row 1 -> high half of m1
+    movu             ym2,   [r0 + 2 * r1]       ; row 2 -> low half of m2
+    vinserti32x8     m2,    [r0 + r3],     1    ; row 3 -> high half of m2
+    lea              r0,    [r0 + 4 * r1]       ; side effect: r0 advances by 4 rows and stays there on exit
+    movu             ym3,   [r0]                ; row 4 -> low half of m3
+    vinserti32x8     m3,    [r0 + r1],     1    ; row 5 -> high half of m3
+    movu             ym4,   [r0 + 2 * r1]       ; row 6 -> low half of m4
+    vinserti32x8     m4,    [r0 + r3],     1    ; row 7 -> high half of m4
+    pmaddwd m1, m1                              ; each dword = a*a + b*b for one adjacent pair of words
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2                              ; combine the four registers pairwise ...
+    paddd   m3, m4
+    paddd   m1, m3                              ; ... into one vector of dword partial sums
+    paddd   m0, m1                              ; accumulate into the running total; m1-m4 are clobbered
+%endmacro
 ;-----------------------------------------------------------------------------
 ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
 ;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH==0
 INIT_ZMM avx512
 cglobal pixel_ssd_s_32, 2,4,5
     add     r1, r1
@@ -3444,10 +3462,39 @@
     PROCESS_SSD_S_32x8_AVX512
 
     ; calculate sum and return
+%if BIT_DEPTH >= 10                 ; this path widens the dword sums to qwords and returns the total in rax
+    movu            m1, m0          ; copy the dword accumulators so both unpack halves can be formed
+    pxor            m2, m2          ; zero vector, supplies the upper dword of every widened element
+    punpckldq       m0, m2          ; low dwords of each 128-bit lane interleaved with zero -> zero-extended qwords
+    punpckhdq       m1, m2          ; same for the high dwords of each 128-bit lane
+    paddq           m0, m1          ; 8 qword partial sums
+    vextracti32x8   ym2, m0, 1      ; ym2 = upper 256 bits of m0
+    paddq           ym0, ym2        ; fold 512 -> 256 bits
+    vextracti32x4   xm2, m0, 1      ; xm2 = second 128-bit lane of m0 (upper half of ym0)
+    paddq           xm2, xm0        ; fold 256 -> 128 bits
+    movhlps         xm1, xm2        ; bring the high qword of xm2 down into the low qword of xm1
+    paddq           xm2, xm1        ; low qword of xm2 = total of all partial sums
+    movq            rax, xm2        ; return value in rax
+%else                               ; BIT_DEPTH < 10: reduce and return a dword
+    HADDD   m0, m1                  ; HADDD is defined outside this view; presumably a horizontal dword add using m1 as scratch
+    movd    eax, xm0                ; return the low dword in eax
+%endif
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_ssd_s_16, 2,4,5                  ; sum of squares of a 16x16 block of words; r0 = block pointer, r1 = stride
+    add     r1, r1                             ; double the stride: per the prototype comment it counts int16_t elements, rows are addressed in bytes
+    lea     r3, [r1 * 3]                       ; r3 = 3 * row stride, used by the macro to reach every 4th row
+    pxor    m0, m0                             ; m0 = running dword partial sums, start at zero
+
+    PROCESS_SSD_S_16x8_AVX512                  ; rows 0..7; leaves r0 pointing at row 4
+    lea     r0, [r0 + 4 * r1]                  ; skip rows 4..7 already consumed -> r0 at row 8
+    PROCESS_SSD_S_16x8_AVX512                  ; rows 8..15
+
+    ; calculate sum and return
     HADDD   m0, m1
     movd    eax, xm0
     RET
-%endif
 ;-----------------------------------------------------------------------------
 ; ssd_s avx512 code end
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list