[x265] [PATCH 034 of 307] x86: AVX512 ssd_ss_64x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:32 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500528397 -19800
#      Thu Jul 20 10:56:37 2017 +0530
# Node ID 0320e60b3323546eb6767508f1c39cd088e9f03e
# Parent  bf9a9cd255216300408506d10d4ff8bc87a15845
x86: AVX512 ssd_ss_64x64

AVX2 performance   : 14.85x
AVX512 performance : 21.35x

diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 20 10:56:37 2017 +0530
@@ -3851,6 +3851,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
 
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
+
     }
 #endif
 }
diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Thu Jul 20 10:56:37 2017 +0530
@@ -1377,7 +1377,124 @@
     HADDD       m2, m0
     movd        eax, xm2
     RET
+;-----------------------------------------------------------------------------
+; ssd_ss avx512 code start
+;-----------------------------------------------------------------------------
+%macro PROCESS_SSD_SS_64x8_AVX512 0 ; add squared word differences of 8 rows ([r0] vs [r2], 2*mmsize bytes per row) into m4/m5; needs r5 = 3*r1, r6 = 3*r3
+    movu        m0, [r0]                        ; row 0 of first input, first mmsize bytes
+    movu        m1, [r0 + mmsize]               ; row 0, second mmsize bytes
+    movu        m2, [r0 + r1]                   ; row 1 (r1 = first input's row stride in bytes)
+    movu        m3, [r0 + r1 + mmsize]
 
+    psubw       m0, [r2]                        ; word-wise difference against the second input
+    psubw       m1, [r2 + mmsize]
+    psubw       m2, [r2 + r3]                   ; r3 = second input's row stride in bytes
+    psubw       m3, [r2 + r3 + mmsize]
+    pmaddwd     m0, m0                          ; square each word difference, add adjacent pairs into dwords
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m4, m0                          ; m4 accumulates the first-half-of-row sums
+    paddd       m5, m1                          ; m5 accumulates the second-half-of-row sums
+    paddd       m4, m2
+    paddd       m5, m3
+
+    movu        m0, [r0 + 2 * r1]               ; row 2
+    movu        m1, [r0 + 2 * r1 + mmsize]
+    movu        m2, [r0 + r5]                   ; row 3 (caller sets r5 = 3 * r1)
+    movu        m3, [r0 + r5 + mmsize]
+
+    psubw       m0, [r2 + 2 * r3]
+    psubw       m1, [r2 + 2 * r3 + mmsize]
+    psubw       m2, [r2 + r6]                   ; caller sets r6 = 3 * r3
+    psubw       m3, [r2 + r6 + mmsize]
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m4, m0
+    paddd       m5, m1
+    paddd       m4, m2
+    paddd       m5, m3
+
+    lea         r0, [r0 + 4 * r1]               ; step both pointers down 4 rows; this is the only
+    lea         r2, [r2 + 4 * r3]               ; advance in the macro, so it exits pointing at row 4 of 8
+
+    movu        m0, [r0]                        ; row 4
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r0 + r1]                   ; row 5
+    movu        m3, [r0 + r1 + mmsize]
+
+    psubw       m0, [r2]
+    psubw       m1, [r2 + mmsize]
+    psubw       m2, [r2 + r3]
+    psubw       m3, [r2 + r3 + mmsize]
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m4, m0
+    paddd       m5, m1
+    paddd       m4, m2
+    paddd       m5, m3
+
+    movu        m0, [r0 + 2 * r1]               ; row 6
+    movu        m1, [r0 + 2 * r1 + mmsize]
+    movu        m2, [r0 + r5]                   ; row 7
+    movu        m3, [r0 + r5 + mmsize]
+
+    psubw       m0, [r2 + 2 * r3]
+    psubw       m1, [r2 + 2 * r3 + mmsize]
+    psubw       m2, [r2 + r6]
+    psubw       m3, [r2 + r6 + mmsize]
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m4, m0                          ; m0-m3 are scratch and hold no useful value on exit
+    paddd       m5, m1
+    paddd       m4, m2
+    paddd       m5, m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_ssd_ss_64x64, 4,7,6               ; x86inc cglobal: 4 args, 7 GPRs, 6 vector regs; r4 is reserved but not referenced here
+    add         r1d, r1d                        ; double r1: presumably element stride -> byte stride for 16-bit samples (32-bit add; assumes stride fits in 32 bits - TODO confirm)
+    add         r3d, r3d                        ; same for the second input's stride
+    lea         r5, [r1 * 3]                    ; r5 = 3 * stride of first input, used for rows 3 and 7 in the macro
+    lea         r6, [r3 * 3]                    ; r6 = 3 * stride of second input
+    pxor        m4, m4                          ; clear the two dword accumulators
+    pxor        m5, m5
+
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; macro exits at row 4 of its 8 rows; add 4 more rows
+    lea         r2, [r2 + 4 * r3]               ; so the next invocation starts on the following 8-row band
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 16-23
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 24-31
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 32-39
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 40-47
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 48-55
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    lea         r0, [r0 + 4 * r1]               ; -> rows 56-63 (eighth and last band)
+    lea         r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512
+    paddd       m4, m5                          ; merge the two partial accumulators
+    HADDD       m4, m0                          ; HADDD is defined elsewhere; presumably sums the dwords of m4 into its low dword using m0 as scratch
+    movd        eax, xm4                        ; return the low 32 bits of m4 in eax
+    RET
+;-----------------------------------------------------------------------------
+; ssd_ss avx512 code end
+;-----------------------------------------------------------------------------
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0


More information about the x265-devel mailing list