[x265] [PATCH 052 of 307] x86: AVX512 pixel_add_ps_64x64 for high bit depth

mythreyi at multicorewareinc.com
Sat Apr 7 04:30:50 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501565215 -19800
#      Tue Aug 01 10:56:55 2017 +0530
# Node ID 05972a61eb1aeac474ecc0d0150671e879177112
# Parent  984cad60283b474ed756238cf904b08df290e103
x86: AVX512 pixel_add_ps_64x64 for high bit depth

AVX2 performance over C:   14.14x
AVX512 performance over C: 20.40x
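
For reference, add_ps adds a 16-bit residual block to a prediction block and
clamps each sum to the valid pixel range. A minimal C++ sketch of the
high-bit-depth semantics (illustrative names, not the exact x265 prototypes;
pixelMax stands in for the pw_pixel_max constant used in the assembly below):

    #include <cstdint>
    #include <algorithm>

    typedef uint16_t pixel;  // high-bit-depth builds use 16-bit pixels

    // dst = clip(pred + residual, 0, pixelMax) over a 64x64 block
    static void add_ps_64x64_ref(pixel* dst, intptr_t dstStride,
                                 const pixel* src0, const int16_t* src1,
                                 intptr_t srcStride0, intptr_t srcStride1,
                                 int pixelMax)
    {
        for (int y = 0; y < 64; y++)
        {
            for (int x = 0; x < 64; x++)
            {
                int sum = src0[x] + src1[x];
                dst[x] = (pixel)std::min(std::max(sum, 0), pixelMax);
            }
            dst  += dstStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }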

diff -r 984cad60283b -r 05972a61eb1a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 16:37:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 10:56:55 2017 +0530
@@ -2197,6 +2197,8 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
 
+        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+
         // 64 X N
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
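
The asm-primitives.cpp hunk above only registers the new kernel in the encoder
primitive table; callers reach it through the add_ps function pointer. A
sketch of the usual x265 call pattern (variable names here are illustrative):

    // Once setupAssemblyPrimitives() takes the AVX-512 path, this indirect
    // call dispatches to PFX(pixel_add_ps_64x64_avx512).
    primitives.cu[BLOCK_64x64].add_ps(dst, dstStride, pred, residual,
                                      predStride, resStride);
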
diff -r 984cad60283b -r 05972a61eb1a source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Jul 25 16:37:38 2017 +0530
+++ b/source/common/x86/pixeladd8.asm	Tue Aug 01 10:56:55 2017 +0530
@@ -1272,7 +1272,7 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; pixel_add_ps_64x64 avx512 code start
 ;-----------------------------------------------------------------------------
 %macro PROCESS_ADD_PS_64x8_AVX512 0
     pmovzxbw    m0,         [r2]
@@ -1376,8 +1376,148 @@
     movu        [r0 + r1],  m4
 %endmacro
 
+%macro PROCESS_ADD_PS_64x8_HBD_AVX512 0
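+    ; One row of 64 16-bit pixels spans 128 bytes, i.e. two zmm loads
+    ; (mmsize = 64). Eight rows are processed as two 4-row groups with a
+    ; pointer advance in between. CLIPW2 clamps each pair of sums to
+    ; [m4, m5] = [0, pw_pixel_max], set up by the caller.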
+    movu    m0,     [r2]
+    movu    m1,     [r2 + mmsize]
+    movu    m2,     [r3]
+    movu    m3,     [r3 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0],                m0
+    movu    [r0 + mmsize],       m1
+
+    movu    m0,     [r2 + r4]
+    movu    m1,     [r2 + r4 + mmsize]
+    movu    m2,     [r3 + r5]
+    movu    m3,     [r3 + r5 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1],           m0
+    movu    [r0 + r1 + mmsize],  m1
+
+    movu    m0,     [r2 + r4 * 2]
+    movu    m1,     [r2 + r4 * 2 + mmsize]
+    movu    m2,     [r3 + r5 * 2]
+    movu    m3,     [r3 + r5 * 2 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1 * 2],           m0
+    movu    [r0 + r1 * 2 + mmsize],  m1
+
+    movu    m0,     [r2 + r6]
+    movu    m1,     [r2 + r6 + mmsize]
+    movu    m2,     [r3 + r7]
+    movu    m3,     [r3 + r7 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r8],               m0
+    movu    [r0 + r8 + mmsize],      m1
+
+    lea     r0,     [r0 + r1 * 4]
+    lea     r2,     [r2 + r4 * 4]
+    lea     r3,     [r3 + r5 * 4]
+
+    movu    m0,     [r2]
+    movu    m1,     [r2 + mmsize]
+    movu    m2,     [r3]
+    movu    m3,     [r3 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0],                m0
+    movu    [r0 + mmsize],       m1
+
+    movu    m0,     [r2 + r4]
+    movu    m1,     [r2 + r4 + mmsize]
+    movu    m2,     [r3 + r5]
+    movu    m3,     [r3 + r5 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1],           m0
+    movu    [r0 + r1 + mmsize],  m1
+
+    movu    m0,     [r2 + r4 * 2]
+    movu    m1,     [r2 + r4 * 2 + mmsize]
+    movu    m2,     [r3 + r5 * 2]
+    movu    m3,     [r3 + r5 * 2 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1 * 2],           m0
+    movu    [r0 + r1 * 2 + mmsize],  m1
+
+    movu    m0,     [r2 + r6]
+    movu    m1,     [r2 + r6 + mmsize]
+    movu    m2,     [r3 + r7]
+    movu    m3,     [r3 + r7 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r8],               m0
+    movu    [r0 + r8 + mmsize],      m1
+%endmacro
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
 %if ARCH_X86_64
-%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_add_ps_64x64, 6, 9, 6
+    vbroadcasti32x8  m5,     [pw_pixel_max]
+    pxor             m4,     m4
+    add             r4d,     r4d
+    add             r5d,     r5d
+    add             r1d,     r1d
+    lea              r6,     [r4 * 3]
+    lea              r7,     [r5 * 3]
+    lea              r8,     [r1 * 3]
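+    ; The doubling above converts element strides to byte strides (pixels
+    ; are 16-bit in HBD builds); r6/r7/r8 hold 3x each stride so a macro
+    ; invocation can address four consecutive rows directly. Eight
+    ; invocations of the 8-row macro cover all 64 rows.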
+
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_64x8_HBD_AVX512
+    RET
+%endif
+%else
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_add_ps_64x64, 6, 7, 8
     add         r5,         r5
@@ -1411,6 +1551,8 @@
     lea         r0,         [r0 + r1 * 2]
     PROCESS_ADD_PS_64x8_AVX512
     RET
-
 %endif
 %endif
+;-----------------------------------------------------------------------------
+; pixel_add_ps_64x64 avx512 code end
+;-----------------------------------------------------------------------------

