[x265] [PATCH 053 of 307] x86: AVX512 pixel_add_ps_32xN for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:51 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501571354 -19800
#      Tue Aug 01 12:39:14 2017 +0530
# Node ID f8687bef93f25b343606e42f4fd252d5f0897d1a
# Parent  05972a61eb1aeac474ecc0d0150671e879177112
x86: AVX512 pixel_add_ps_32xN for high bit depth

AVX2 performance   : 12.77x
AVX512 performance : 21.54x

This patch also cleans up the low bit depth code

diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 12:39:14 2017 +0530
@@ -2198,6 +2198,9 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
 
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
         // 64 X N
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
@@ -3893,8 +3896,8 @@
 
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/pixeladd8.asm	Tue Aug 01 12:39:14 2017 +0530
@@ -769,132 +769,6 @@
 PIXEL_ADD_PS_W32_H4_avx2 64
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PROCESS_ADD_PS_32x8_AVX512 0
-    pmovzxbw    m0,         [r2]                ; row 0 of src0
-    movu        m1,         [r3]                ; row 0 of src1
-    pmovzxbw    m2,         [r2 + r4]           ; row 1 of src0
-    movu        m3,         [r3 + r5]           ; row 1 of src1
-    pmovzxbw    m4,         [r2 + r4 * 2]       ; row 2 of src0
-    movu        m5,         [r3 + r5 * 2]       ; row 2 of src1
-    pmovzxbw    m6,         [r2 + r7]           ; row 3 of src0
-    movu        m7,         [r3 + r8]           ; row 3 of src1
-
-    paddw       m0,         m1
-    paddw       m2,         m3
-    paddw       m4,         m5
-    paddw       m6,         m7
-    packuswb    m0,         m2
-    packuswb    m4,         m6
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
-    movu        [r0],            ym0            ; row 0 of dst
-    movu        [r0 + r1 * 2],   ym4            ; row 2 of dst
-    vshufi64x2  m0,         m0, 01001110b
-    vshufi64x2  m4,         m4, 01001110b
-    movu        [r0 + r1],       ym0            ; row 1 of dst
-    movu        [r0 + r9],       ym4            ; row 3 of dst
-
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-
-    pmovzxbw    m0,         [r2]                ; row 4 of src0
-    movu        m1,         [r3]                ; row 4 of src1
-    pmovzxbw    m2,         [r2 + r4]           ; row 5 of src0
-    movu        m3,         [r3 + r5]           ; row 5 of src1
-    pmovzxbw    m4,         [r2 + r4 * 2]       ; row 6 of src0
-    movu        m5,         [r3 + r5 * 2]       ; row 6 of src1
-    pmovzxbw    m6,         [r2 + r7]           ; row 7 of src0
-    movu        m7,         [r3 + r8]           ; row 7 of src1
-
-    paddw       m0,         m1
-    paddw       m2,         m3
-    paddw       m4,         m5
-    paddw       m6,         m7
-    packuswb    m0,         m2
-    packuswb    m4,         m6
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
-    movu        [r0],            ym0            ; row 4 of dst
-    movu        [r0 + r1 * 2],   ym4            ; row 6 of dst
-    vshufi64x2  m0,         m0, 01001110b
-    vshufi64x2  m4,         m4, 01001110b
-    movu        [r0 + r1],       ym0            ; row 5 of dst
-    movu        [r0 + r9],       ym4            ; row 7 of dst
-%endmacro
-
-
-%if HIGH_BIT_DEPTH==0
-%if ARCH_X86_64
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x32, 6, 10, 8
-    add         r5,         r5
-    lea         r7,         [r4 * 3]
-    lea         r8,         [r5 * 3]
-    lea         r9,         [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x64, 6, 10, 8
-    add         r5,         r5
-    lea         r7,         [r4 * 3]
-    lea         r8,         [r5 * 3]
-    lea         r9,         [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    RET
-%endif
-%endif
-
-;-----------------------------------------------------------------------------
 ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W64_H2 2
@@ -1272,7 +1146,7 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code start
+; pixel_add_ps avx512 code start
 ;-----------------------------------------------------------------------------
 %macro PROCESS_ADD_PS_64x8_AVX512 0
     pmovzxbw    m0,         [r2]
@@ -1553,6 +1427,250 @@
     RET
 %endif
 %endif
+
+%macro PROCESS_ADD_PS_32x8_AVX512 0 ; 8 rows of 32 px: dst(u8) = sat_u8(src0(u8) + src1(s16)); expects r6 = 3*r4, r7 = 3*r5, r8 = 3*r1; advances r0/r2/r3 by 4 rows
+    pmovzxbw    m0,         [r2]                ; rows 0-1: src0 row 0, 32 bytes zero-extended to 32 words
+    movu        m1,         [r3]                ; src1 row 0, 32 words
+    pmovzxbw    m2,         [r2 + r4]           ; src0 row 1
+    movu        m3,         [r3 + r5]           ; src1 row 1
+    paddw       m0,         m1                  ; row 0 sums (16-bit)
+    paddw       m2,         m3                  ; row 1 sums (16-bit)
+    packuswb    m0,         m2                  ; saturate to u8; packs per 128-bit lane, so rows 0 and 1 are interleaved in 8-byte groups
+    vpermq      m0,         m0, 11011000b       ; qword order 0,2,1,3 inside each 256-bit half
+    vshufi64x2  m0,         m0, 11011000b       ; 128-bit lane order 0,2,1,3: low 256 bits = row 0, high 256 bits = row 1
+    movu        [r0],            ym0            ; store dst row 0 (32 bytes)
+    vshufi64x2  m0,         m0, 01001110b       ; swap 256-bit halves so row 1 is in ym0
+    movu        [r0 + r1],       ym0            ; store dst row 1
+
+    pmovzxbw    m0,         [r2 + r4 * 2]       ; rows 2-3: src0 row 2
+    movu        m1,         [r3 + r5 * 2]       ; src1 row 2
+    pmovzxbw    m2,         [r2 + r6]           ; src0 row 3 (r6 = 3 * src0 stride)
+    movu        m3,         [r3 + r7]           ; src1 row 3 (r7 = 3 * src1 stride in bytes)
+    paddw       m0,         m1
+    paddw       m2,         m3
+    packuswb    m0,         m2                  ; same pack + de-interleave sequence as rows 0-1
+    vpermq      m0,         m0, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    movu        [r0 + r1 * 2],   ym0            ; store dst row 2
+    vshufi64x2  m0,         m0, 01001110b
+    movu        [r0 + r8],       ym0            ; store dst row 3 (r8 = 3 * dst stride)
+
+    lea         r2,         [r2 + r4 * 4]       ; step src0, src1 and dst down 4 rows for the second half
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+
+    pmovzxbw    m0,         [r2]                ; rows 4-5: src0 row 4
+    movu        m1,         [r3]                ; src1 row 4
+    pmovzxbw    m2,         [r2 + r4]           ; src0 row 5
+    movu        m3,         [r3 + r5]           ; src1 row 5
+    paddw       m0,         m1
+    paddw       m2,         m3
+    packuswb    m0,         m2
+    vpermq      m0,         m0, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    movu        [r0],            ym0            ; store dst row 4
+    vshufi64x2  m0,         m0, 01001110b
+    movu        [r0 + r1],       ym0            ; store dst row 5
+
+    pmovzxbw    m0,         [r2 + r4 * 2]       ; rows 6-7: src0 row 6
+    movu        m1,         [r3 + r5 * 2]       ; src1 row 6
+    pmovzxbw    m2,         [r2 + r6]           ; src0 row 7
+    movu        m3,         [r3 + r7]           ; src1 row 7
+    paddw       m0,         m1
+    paddw       m2,         m3
+    packuswb    m0,         m2
+    vpermq      m0,         m0, 11011000b
+    vshufi64x2  m0,         m0, 11011000b
+    movu        [r0 + r1 * 2],   ym0            ; store dst row 6
+    vshufi64x2  m0,         m0, 01001110b
+    movu        [r0 + r8],       ym0            ; store dst row 7; pointers are left at row 4 of this group
+%endmacro
+
+%macro PROCESS_ADD_PS_32x8_HBD_AVX512 0 ; 8 rows of 32 words: dst = clip(src0 + src1); expects r1/r4/r5 as byte strides, r6 = 3*r4, r7 = 3*r5, r8 = 3*r1, clip bounds in m4/m5
+    movu    m0,     [r2]                        ; rows 0-1: src0 row 0, 32 words
+    movu    m1,     [r2 + r4]                   ; src0 row 1
+    movu    m2,     [r3]                        ; src1 row 0
+    movu    m3,     [r3 + r5]                   ; src1 row 1
+    paddw   m0,     m2                          ; row 0 sums
+    paddw   m1,     m3                          ; row 1 sums
+
+    CLIPW2  m0, m1, m4, m5                      ; CLIPW2 is defined outside this patch; presumably clamps m0/m1 to [m4, m5] - TODO confirm
+    movu    [r0],                m0             ; store dst row 0 (64 bytes)
+    movu    [r0 + r1],           m1             ; store dst row 1
+
+    movu    m0,     [r2 + r4 * 2]               ; rows 2-3: src0 row 2
+    movu    m1,     [r2 + r6]                   ; src0 row 3 (r6 = 3 * src0 stride)
+    movu    m2,     [r3 + r5 * 2]               ; src1 row 2
+    movu    m3,     [r3 + r7]                   ; src1 row 3 (r7 = 3 * src1 stride)
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1 * 2],           m0         ; store dst row 2
+    movu    [r0 + r8],               m1         ; store dst row 3 (r8 = 3 * dst stride)
+
+    lea     r0,     [r0 + r1 * 4]               ; step dst, src0 and src1 down 4 rows for the second half
+    lea     r2,     [r2 + r4 * 4]
+    lea     r3,     [r3 + r5 * 4]
+
+    movu    m0,     [r2]                        ; rows 4-5: src0 row 4
+    movu    m1,     [r2 + r4]                   ; src0 row 5
+    movu    m2,     [r3]                        ; src1 row 4
+    movu    m3,     [r3 + r5]                   ; src1 row 5
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0],                m0             ; store dst row 4
+    movu    [r0 + r1],           m1             ; store dst row 5
+
+    movu    m0,     [r2 + r4 * 2]               ; rows 6-7: src0 row 6
+    movu    m1,     [r2 + r6]                   ; src0 row 7
+    movu    m2,     [r3 + r5 * 2]               ; src1 row 6
+    movu    m3,     [r3 + r7]                   ; src1 row 7
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    movu    [r0 + r1 * 2],           m0         ; store dst row 6
+    movu    [r0 + r8],               m1         ; store dst row 7; pointers are left at row 4 of this group
+%endmacro
 ;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code end
+; void pixel_add_ps_32xN(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 6 ; high bit depth 32x32: r0 = dest, r1 = dest stride, r2 = src0, r3 = src1, r4 = src0 stride, r5 = src1 stride
+    vbroadcasti32x8  m5,     [pw_pixel_max]     ; m5 = upper clip bound, 32 bytes at pw_pixel_max replicated to both halves
+    pxor             m4,     m4                 ; m4 = lower clip bound (all zero)
+    add              r4,     r4                 ; strides are intptr_t: double them (elements -> bytes) with 64-bit adds;
+    add              r5,     r5                 ; a 32-bit add would zero-extend the result and corrupt a negative
+    add              r1,     r1                 ; or wider-than-31-bit stride
+    lea              r6,     [r4 * 3]           ; r6 = 3 * src0 byte stride
+    lea              r7,     [r5 * 3]           ; r7 = 3 * src1 byte stride
+    lea              r8,     [r1 * 3]           ; r8 = 3 * dest byte stride
+
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 0-7; the macro leaves the pointers at row 4 of the group
+    lea         r2,         [r2 + r4 * 4]       ; so step 4 more rows to reach the next 8-row group
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 8-15
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 16-23
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 24-31
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 6 ; high bit depth 32x64: r0 = dest, r1 = dest stride, r2 = src0, r3 = src1, r4 = src0 stride, r5 = src1 stride
+    vbroadcasti32x8  m5,     [pw_pixel_max]     ; m5 = upper clip bound, 32 bytes at pw_pixel_max replicated to both halves
+    pxor             m4,     m4                 ; m4 = lower clip bound (all zero)
+    add              r4,     r4                 ; strides are intptr_t: double them (elements -> bytes) with 64-bit adds;
+    add              r5,     r5                 ; a 32-bit add would zero-extend the result and corrupt a negative
+    add              r1,     r1                 ; or wider-than-31-bit stride
+    lea              r6,     [r4 * 3]           ; r6 = 3 * src0 byte stride
+    lea              r7,     [r5 * 3]           ; r7 = 3 * src1 byte stride
+    lea              r8,     [r1 * 3]           ; r8 = 3 * dest byte stride
+
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 0-7; the macro leaves the pointers at row 4 of the group
+    lea         r2,         [r2 + r4 * 4]       ; so step 4 more rows to reach the next 8-row group
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 8-15
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 16-23
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 24-31
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 32-39
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 40-47
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 48-55
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512              ; rows 56-63
+    RET
+%endif
+%else
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 4 ; 8-bit 32x32: r0 = dest, r1 = dest stride, r2 = src0, r3 = src1, r4 = src0 stride, r5 = src1 stride; body uses r0-r8 and m0-m3
+    add         r5,         r5                  ; src1 holds 16-bit values: element stride -> byte stride (src0/dest are bytes, left as-is)
+    lea         r6,         [r4 * 3]            ; r6 = 3 * src0 stride
+    lea         r7,         [r5 * 3]            ; r7 = 3 * src1 byte stride
+    lea         r8,         [r1 * 3]            ; r8 = 3 * dest stride
+
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 0-7; the macro leaves the pointers at row 4 of the group
+    lea         r2,         [r2 + r4 * 4]       ; so step 4 more rows to reach the next 8-row group
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 8-15
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 16-23
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 24-31
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 4 ; 8-bit 32x64: r0 = dest, r1 = dest stride, r2 = src0, r3 = src1, r4 = src0 stride, r5 = src1 stride; body uses r0-r8 and m0-m3
+    add         r5,         r5                  ; src1 holds 16-bit values: element stride -> byte stride (src0/dest are bytes, left as-is)
+    lea         r6,         [r4 * 3]            ; r6 = 3 * src0 stride
+    lea         r7,         [r5 * 3]            ; r7 = 3 * src1 byte stride
+    lea         r8,         [r1 * 3]            ; r8 = 3 * dest stride
+
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 0-7; the macro leaves the pointers at row 4 of the group
+    lea         r2,         [r2 + r4 * 4]       ; so step 4 more rows to reach the next 8-row group
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 8-15
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 16-23
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 24-31
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 32-39
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 40-47
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 48-55
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512                  ; rows 56-63
+    RET
+%endif
+%endif
+;-----------------------------------------------------------------------------
+; pixel_add_ps avx512 code end
+;-----------------------------------------------------------------------------


More information about the x265-devel mailing list