[x265] [PATCH 278 of 307] x86: AVX512 ssd_ss and ssd_pp for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:36 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515047865 -19800
#      Thu Jan 04 12:07:45 2018 +0530
# Node ID c225f79142184cb78ded05ba791edc9b3a40b3da
# Parent  4e9f2efdfd097910aa5bf704a4bbf38b0a28f2a5
x86: AVX512 ssd_ss and ssd_pp for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x32 |       6.83x       |      12.15x
64x64 |      14.36x       |      20.24x

diff -r 4e9f2efdfd09 -r c225f7914218 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 29 09:52:27 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jan 04 12:07:45 2018 +0530
@@ -3124,6 +3124,14 @@
         p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
         p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
         p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx512);
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx512);
+        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx512);
+        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
+
     }
 #endif
 }
diff -r 4e9f2efdfd09 -r c225f7914218 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Fri Dec 29 09:52:27 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Thu Jan 04 12:07:45 2018 +0530
@@ -703,8 +703,109 @@
     paddq           xm3, xm4
     movq            rax, xm3
     RET
+
+; pixel_ssd_32x2: internal helper, reached with `call` from the 32xN entry points.
+; In:  r0/r2 = row pointers, r1/r3 = row pitch as byte offsets, m3 = qword running total.
+; Out: m3 += squared differences of two 64-byte rows; r0/r2 advanced two rows. Uses m0-m2.
+INIT_ZMM avx512
+cglobal pixel_ssd_32x2
+    movu            m0, [r0]                    ; first row
+    psubw           m0, [r2]
+    pmaddwd         m0, m0                      ; square the word differences, add adjacent pairs -> dwords
+    movu            m1, [r0 + r1]               ; second row
+    psubw           m1, [r2 + r3]
+    pmaddwd         m1, m1
+    paddd           m0, m1                      ; dword partial sums covering both rows
+    lea             r0, [r0 + r1 * 2]
+    lea             r2, [r2 + r3 * 2]
+
+    pxor            m2, m2
+    punpckhdq       m1, m0, m2                  ; interleave with zero: dwords become qwords,
+    punpckldq       m0, m2                      ; so the running total is kept in 64-bit lanes
+
+    paddq           m3, m0
+    paddq           m3, m1
+    ret
+
+; pixel_ssd_32x32 (AVX-512): returns in rax the sum of squared word differences over 32 rows of 64 bytes.
+INIT_ZMM avx512
+cglobal pixel_ssd_32x32, 4,5,5
+    FIX_STRIDES     r1, r3                      ; same stride scaling as pixel_ssd_64x64; keeps the full register width
+    pxor            m3, m3                      ; m3 = qword accumulator updated by pixel_ssd_32x2
+    mov             r4d, 16                     ; 16 helper calls, two rows each
+.iterate:
+    call            pixel_ssd_32x2
+    dec             r4d
+    jg              .iterate
+
+    vextracti32x8   ym4, m3, 1                  ; fold the upper 256 bits onto the lower
+    paddq           ym3, ym4
+    vextracti32x4   xm4, m3, 1                  ; fold 256 -> 128 bits
+    paddq           xm3, xm4
+    movhlps         xm4, xm3                    ; fold the high qword onto the low one
+    paddq           xm3, xm4
+    movq            rax, xm3
+    RET
+
+; pixel_ssd_32x64 (AVX-512): returns in rax the sum of squared word differences over 64 rows of 64 bytes.
+INIT_ZMM avx512
+cglobal pixel_ssd_32x64, 4,5,5
+    FIX_STRIDES     r1, r3                      ; same stride scaling as pixel_ssd_64x64; keeps the full register width
+    pxor            m3, m3                      ; m3 = qword accumulator updated by pixel_ssd_32x2
+    mov             r4d, 32                     ; 32 helper calls, two rows each
+.iterate:
+    call            pixel_ssd_32x2
+    dec             r4d
+    jg              .iterate
+
+    vextracti32x8   ym4, m3, 1                  ; fold the upper 256 bits onto the lower
+    paddq           ym3, ym4
+    vextracti32x4   xm4, m3, 1                  ; fold 256 -> 128 bits
+    paddq           xm3, xm4
+    movhlps         xm4, xm3                    ; fold the high qword onto the low one
+    paddq           xm3, xm4
+    movq            rax, xm3
+    RET
+
+; pixel_ssd_64x64 (AVX-512): r0/r2 = block pointers, r1/r3 = strides (scaled by FIX_STRIDES).
+; Returns in rax the sum of squared word differences over 64 rows of 2 * mmsize bytes.
+INIT_ZMM avx512
+cglobal pixel_ssd_64x64, 4,5,5
+    FIX_STRIDES     r1, r3
+    mov             r4d, 64                     ; r4d = rows remaining
+    pxor            m3, m3                      ; m3 = running total in qword lanes
+    pxor            m2, m2                      ; m2 = zero for dword->qword widening (loop-invariant)
+
+.loop:
+    movu            m0, [r0]                    ; first half of the row
+    psubw           m0, [r2]
+    pmaddwd         m0, m0                      ; square the word differences, add adjacent pairs -> dwords
+    movu            m1, [r0 + mmsize]           ; second half of the row
+    psubw           m1, [r2 + mmsize]
+    pmaddwd         m1, m1
+    paddd           m0, m1                      ; dword partial sums for the whole row
+
+    lea             r0, [r0 + r1]
+    lea             r2, [r2 + r3]
+
+    punpckhdq       m1, m0, m2                  ; interleave with zero: dwords become qwords
+    punpckldq       m0, m2
+    paddq           m3, m0
+    paddq           m3, m1
+
+    dec             r4d
+    jg              .loop
+
+    ; horizontal reduction of the eight qword lanes of m3 into rax
+    vextracti32x8   ym4, m3, 1
+    paddq           ym3, ym4
+    vextracti32x4   xm4, m3, 1
+    paddq           xm3, xm4
+    movhlps         xm4, xm3
+    paddq           xm3, xm4
+    movq            rax, xm3
+    RET
 %endif
-
 INIT_MMX mmx2
 SSD_ONE     4,  4
 SSD_ONE     4,  8


More information about the x265-devel mailing list