[x265] [PATCH 021 of 307] x86: AVX512 pixel_add_ps_64x64

mythreyi at multicorewareinc.com
Sat Apr 7 04:30:19 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499923196 -19800
#      Thu Jul 13 10:49:56 2017 +0530
# Node ID 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e
# Parent  a32718b2358bab3f19861d8402fe9adc8e312633
x86: AVX512 pixel_add_ps_64x64

AVX2 performance (speedup over C):    13.99x
AVX512 performance (speedup over C):  21.64x

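For reference, cu[BLOCK_64x64].add_ps reconstructs a 64x64 block by adding an
int16_t residual (src1) to the 8-bit predicted pixels (src0) and saturating the
sum back to the pixel range. A minimal scalar sketch of that behaviour for the
8-bit build follows; the names are illustrative (the real prototype is the
pixel_add_ps_t primitive typedef in x265), not part of this patch:

#include <stdint.h>

typedef uint8_t pixel;   /* 8-bit build, i.e. HIGH_BIT_DEPTH == 0 */

/* Scalar sketch only: widen, add the residual, clip back to [0, 255]. */
static void pixel_add_ps_64x64_c(pixel *dst, intptr_t dstStride,
                                 const pixel *src0, const int16_t *src1,
                                 intptr_t srcStride0, intptr_t srcStride1)
{
    for (int y = 0; y < 64; y++)
    {
        for (int x = 0; x < 64; x++)
        {
            int v = src0[x] + src1[x];
            dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst  += dstStride;
        src0 += srcStride0;
        src1 += srcStride1;
    }
}

Note that the assembly below doubles r5 once on entry (add r5, r5) because
srcStride1 counts int16_t elements while the loads address bytes.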
diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 13 10:49:56 2017 +0530
@@ -3805,6 +3805,8 @@
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
 
+        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+
     }
 #endif
 }
diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/pixeladd8.asm	Thu Jul 13 10:49:56 2017 +0530
@@ -1145,3 +1145,147 @@
     RET
 
 %endif
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PROCESS_ADD_PS_64x8_AVX512 0
+    pmovzxbw    m0,         [r2]
+    pmovzxbw    m1,         [r2 + 32]
+    movu        m2,         [r3]
+    movu        m3,         [r3 + 64]
+    pmovzxbw    m4,         [r2 + r4]
+    pmovzxbw    m5,         [r2 + r4 + 32]
+    movu        m6,         [r3 + r5]
+    movu        m7,         [r3 + r5 + 64]
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    paddw       m4,         m6
+    paddw       m5,         m7
+    packuswb    m0,         m1
+    packuswb    m4,         m5
+    vpermq      m0,         m0, 11011000b
+    vpermq      m4,         m4, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    vshufi64x2  m4,         m4, 11011000b
+    movu        [r0],       m0
+    movu        [r0 + r1],  m4
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    pmovzxbw    m0,         [r2]
+    pmovzxbw    m1,         [r2 + 32]
+    movu        m2,         [r3]
+    movu        m3,         [r3 + 64]
+    pmovzxbw    m4,         [r2 + r4]
+    pmovzxbw    m5,         [r2 + r4 + 32]
+    movu        m6,         [r3 + r5]
+    movu        m7,         [r3 + r5 + 64]
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    paddw       m4,         m6
+    paddw       m5,         m7
+    packuswb    m0,         m1
+    packuswb    m4,         m5
+    vpermq      m0,         m0, 11011000b
+    vpermq      m4,         m4, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    vshufi64x2  m4,         m4, 11011000b
+    movu        [r0],       m0
+    movu        [r0 + r1],  m4
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    pmovzxbw    m0,         [r2]
+    pmovzxbw    m1,         [r2 + 32]
+    movu        m2,         [r3]
+    movu        m3,         [r3 + 64]
+    pmovzxbw    m4,         [r2 + r4]
+    pmovzxbw    m5,         [r2 + r4 + 32]
+    movu        m6,         [r3 + r5]
+    movu        m7,         [r3 + r5 + 64]
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    paddw       m4,         m6
+    paddw       m5,         m7
+    packuswb    m0,         m1
+    packuswb    m4,         m5
+    vpermq      m0,         m0, 11011000b
+    vpermq      m4,         m4, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    vshufi64x2  m4,         m4, 11011000b
+    movu        [r0],       m0
+    movu        [r0 + r1],  m4
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    pmovzxbw    m0,         [r2]
+    pmovzxbw    m1,         [r2 + 32]
+    movu        m2,         [r3]
+    movu        m3,         [r3 + 64]
+    pmovzxbw    m4,         [r2 + r4]
+    pmovzxbw    m5,         [r2 + r4 + 32]
+    movu        m6,         [r3 + r5]
+    movu        m7,         [r3 + r5 + 64]
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    paddw       m4,         m6
+    paddw       m5,         m7
+    packuswb    m0,         m1
+    packuswb    m4,         m5
+    vpermq      m0,         m0, 11011000b
+    vpermq      m4,         m4, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    vshufi64x2  m4,         m4, 11011000b
+    movu        [r0],       m0
+    movu        [r0 + r1],  m4
+%endmacro
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_add_ps_64x64, 6, 7, 8
+    add         r5,         r5
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512
+    RET
+
+%endif
+%endif
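The only non-obvious part of the kernel is the store path: packuswb saturates
the 16-bit sums to bytes but works independently within each 128-bit lane, so
the packed bytes come out lane-interleaved. The vpermq / vshufi64x2 pair, both
with immediate 11011000b (0xD8), restores linear order before each 64-byte
store. A rough intrinsics sketch of one 64-pixel row, assuming an AVX-512BW
target (illustrative only; the patch itself is the assembly above):

#include <immintrin.h>
#include <stdint.h>

/* One 64-pixel row: dst[i] = clip_u8(src0[i] + src1[i]).
 * packus interleaves its two sources per 128-bit lane, so the qword permute
 * (0xD8 within each 256-bit half) plus the 128-bit block shuffle (0xD8 across
 * the register) put the 64 output bytes back in linear order. */
static void add_ps_row64(uint8_t *dst, const uint8_t *src0, const int16_t *src1)
{
    __m512i p0 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i *)src0));
    __m512i p1 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i *)(src0 + 32)));
    __m512i r0 = _mm512_loadu_si512(src1);
    __m512i r1 = _mm512_loadu_si512(src1 + 32);

    __m512i packed = _mm512_packus_epi16(_mm512_add_epi16(p0, r0),
                                         _mm512_add_epi16(p1, r1));
    packed = _mm512_permutex_epi64(packed, 0xD8);          /* vpermq ..., 11011000b     */
    packed = _mm512_shuffle_i64x2(packed, packed, 0xD8);   /* vshufi64x2 ..., 11011000b */
    _mm512_storeu_si512(dst, packed);
}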

