[x265] [PATCH 231 of 307] x86: AVX512 ssd_ss_16x16

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:49 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512371012 -19800
#      Mon Dec 04 12:33:32 2017 +0530
# Node ID 465682e66d91ecf207feae78c33e32f0eaaf45c4
# Parent  4f690222337dbc1757665729ea15f2380a11c329
x86: AVX512 ssd_ss_16x16
AVX2 performance   : 43.55x
AVX512 performance : 48.11x

This patch also cleans up the already existing ssd_ss AVX512 code.

diff -r 4f690222337d -r 465682e66d91 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Dec 04 12:33:32 2017 +0530
@@ -4743,6 +4743,7 @@
 
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
 
diff -r 4f690222337d -r 465682e66d91 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Mon Dec 04 12:33:32 2017 +0530
@@ -1390,183 +1390,120 @@
 ;-----------------------------------------------------------------------------
 ; ssd_ss avx512 code start
 ;-----------------------------------------------------------------------------
-%macro PROCESS_SSD_SS_64x8_AVX512 0
+%if ARCH_X86_64
+%macro PROCESS_SSD_SS_64x4_AVX512 0
     movu        m0, [r0]
     movu        m1, [r0 + mmsize]
     movu        m2, [r0 + r1]
     movu        m3, [r0 + r1 + mmsize]
-
-    psubw       m0, [r2]
-    psubw       m1, [r2 + mmsize]
-    psubw       m2, [r2 + r3]
-    psubw       m3, [r2 + r3 + mmsize]
+    movu        m4, [r2]
+    movu        m5, [r2 + mmsize]
+    movu        m6, [r2 + r3]
+    movu        m7, [r2 + r3 + mmsize]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
     pmaddwd     m0, m0
     pmaddwd     m1, m1
     pmaddwd     m2, m2
     pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
+    paddd       m8, m0
+    paddd       m8, m1
+    paddd       m8, m2
+    paddd       m8, m3
 
     movu        m0, [r0 + 2 * r1]
     movu        m1, [r0 + 2 * r1 + mmsize]
     movu        m2, [r0 + r5]
     movu        m3, [r0 + r5 + mmsize]
-
-    psubw       m0, [r2 + 2 * r3]
-    psubw       m1, [r2 + 2 * r3 + mmsize]
-    psubw       m2, [r2 + r6]
-    psubw       m3, [r2 + r6 + mmsize]
+    movu        m4, [r2 + 2 * r3]
+    movu        m5, [r2 + 2 * r3 + mmsize]
+    movu        m6, [r2 + r6]
+    movu        m7, [r2 + r6 + mmsize]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
     pmaddwd     m0, m0
     pmaddwd     m1, m1
     pmaddwd     m2, m2
     pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
-
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-
+    paddd       m8, m0
+    paddd       m8, m1
+    paddd       m8, m2
+    paddd       m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_32x4_AVX512 0
     movu        m0, [r0]
-    movu        m1, [r0 + mmsize]
-    movu        m2, [r0 + r1]
-    movu        m3, [r0 + r1 + mmsize]
-
-    psubw       m0, [r2]
-    psubw       m1, [r2 + mmsize]
-    psubw       m2, [r2 + r3]
-    psubw       m3, [r2 + r3 + mmsize]
+    movu        m1, [r0 + r1]
+    movu        m2, [r0 + 2 * r1]
+    movu        m3, [r0 + r5]
+    movu        m4, [r2]
+    movu        m5, [r2 + r3]
+    movu        m6, [r2 + 2 * r3]
+    movu        m7, [r2 + r6]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
     pmaddwd     m0, m0
     pmaddwd     m1, m1
     pmaddwd     m2, m2
     pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
-
-    movu        m0, [r0 + 2 * r1]
-    movu        m1, [r0 + 2 * r1 + mmsize]
-    movu        m2, [r0 + r5]
-    movu        m3, [r0 + r5 + mmsize]
-
-    psubw       m0, [r2 + 2 * r3]
-    psubw       m1, [r2 + 2 * r3 + mmsize]
-    psubw       m2, [r2 + r6]
-    psubw       m3, [r2 + r6 + mmsize]
+    paddd       m8, m0
+    paddd       m8, m1
+    paddd       m8, m2
+    paddd       m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_16x4_AVX512 0
+    movu           ym0, [r0]
+    vinserti32x8    m0, [r0 + r1],    1
+    movu           ym1, [r0 + 2 * r1]
+    vinserti32x8    m1, [r0 + r5],    1
+    movu           ym4, [r2]
+    vinserti32x8    m4, [r2 + r3],    1
+    movu           ym5, [r2 + 2 * r3]
+    vinserti32x8    m5, [r2 + r6],    1
+
+    psubw       m0, m4
+    psubw       m1, m5
     pmaddwd     m0, m0
     pmaddwd     m1, m1
-    pmaddwd     m2, m2
-    pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
+    paddd       m8, m0
+    paddd       m8, m1
 %endmacro
 
-%macro PROCESS_SSD_SS_32x8_AVX512 0
-    movu        m0, [r0]
-    movu        m1, [r0 + r1]
-    movu        m2, [r0 + 2 * r1]
-    movu        m3, [r0 + r5]
-
-    psubw       m0, [r2]
-    psubw       m1, [r2 + r3]
-    psubw       m2, [r2 + 2 * r3]
-    psubw       m3, [r2 + r6]
-    pmaddwd     m0, m0
-    pmaddwd     m1, m1
-    pmaddwd     m2, m2
-    pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
-
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-
-    movu        m0, [r0]
-    movu        m1, [r0 + r1]
-    movu        m2, [r0 + 2 * r1]
-    movu        m3, [r0 + r5]
-
-    psubw       m0, [r2]
-    psubw       m1, [r2 + r3]
-    psubw       m2, [r2 + 2 * r3]
-    psubw       m3, [r2 + r6]
-    pmaddwd     m0, m0
-    pmaddwd     m1, m1
-    pmaddwd     m2, m2
-    pmaddwd     m3, m3
-    paddd       m4, m0
-    paddd       m5, m1
-    paddd       m4, m2
-    paddd       m5, m3
-%endmacro
-
+%macro SSD_SS_AVX512 2
 INIT_ZMM avx512
-cglobal pixel_ssd_ss_64x64, 4,7,6
+cglobal pixel_ssd_ss_%1x%2, 4,7,9
     add         r1d, r1d
     add         r3d, r3d
     lea         r5, [r1 * 3]
     lea         r6, [r3 * 3]
-    pxor        m4, m4
-    pxor        m5, m5
-
-    PROCESS_SSD_SS_64x8_AVX512
+    pxor        m8, m8
+
+%rep %2/4 - 1
+    PROCESS_SSD_SS_%1x4_AVX512
     lea         r0, [r0 + 4 * r1]
     lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_64x8_AVX512
-    paddd       m4, m5
-    HADDD       m4, m0
-    movd        eax, xm4
+%endrep
+    PROCESS_SSD_SS_%1x4_AVX512
+    HADDD       m8, m0
+    movd        eax, xm8
     RET
-
-INIT_ZMM avx512
-cglobal pixel_ssd_ss_32x32, 4,7,6
-    add         r1d, r1d
-    add         r3d, r3d
-    lea         r5, [r1 * 3]
-    lea         r6, [r3 * 3]
-    pxor        m4, m4
-    pxor        m5, m5
-
-    PROCESS_SSD_SS_32x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_32x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_32x8_AVX512
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-    PROCESS_SSD_SS_32x8_AVX512
-    paddd       m4, m5
-    HADDD       m4, m0
-    movd        eax, xm4
-    RET
+%endmacro
+
+
+SSD_SS_AVX512 64, 64
+SSD_SS_AVX512 32, 32
+SSD_SS_AVX512 16, 16
+%endif
 ;-----------------------------------------------------------------------------
 ; ssd_ss avx512 code end
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list