[x265] [PATCH 086 of 307] x86: AVX512 cleanup add_ps code

mythreyi at multicorewareinc.com
Sat Apr 7 04:31:24 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502773372 -19800
#      Tue Aug 15 10:32:52 2017 +0530
# Node ID 2db192bac0f14d55f7f82b8964d6c67c3a3637c3
# Parent  6f811dfd5690866f4c432911982a30665dc0e91c
x86: AVX512 cleanup add_ps code
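
The cleanup has two parts. First, the per-store shuffle fixup after
packuswb (a vpermq with an immediate followed by a vshufi64x2) is
replaced by a single table-driven vpermq: the new 64-byte constant
store_shuf1_avx512 (dq 0, 2, 4, 6, 1, 3, 5, 7) is loaded into m4 once
per low-bit-depth entry point and reused for every store. Second, the
unrolled 64x8 and 32x8 macro bodies are cut down to 64x4 and 32x4
variants invoked from %rep loops, removing a large block of duplicated
code.

For reference, a minimal scalar sketch of the operation being
vectorized, assuming the 8-bit (low bit depth) build where pixel is
uint8_t. Parameter names follow the prototype comment in the diff
(normalizing its "destride"/"scr1" spellings); the block-size arguments
bx/by are added here for illustration and are fixed per entry point in
the assembly:

    #include <stdint.h>

    /* dest = clip(src0 + src1), clamped to [0, 255]; packuswb performs
     * the same unsigned saturation in the AVX-512 path. Strides are in
     * units of the element type, as in the assembly. */
    static void pixel_add_ps_c(uint8_t *dest, intptr_t destStride,
                               const uint8_t *src0, const int16_t *src1,
                               intptr_t srcStride0, intptr_t srcStride1,
                               int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
            {
                int v = src0[x] + src1[x];
                dest[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            dest += destStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }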

diff -r 6f811dfd5690 -r 2db192bac0f1 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Fri Aug 11 12:32:50 2017 +0530
+++ b/source/common/x86/pixeladd8.asm	Tue Aug 15 10:32:52 2017 +0530
@@ -24,11 +24,11 @@
 
 %include "x86inc.asm"
 %include "x86util.asm"
+SECTION_RODATA 64
 
-SECTION_RODATA 32
-
+ALIGN 64
+const store_shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
 SECTION .text
-
 cextern pw_pixel_max
 
 ;-----------------------------------------------------------------------------
@@ -1148,157 +1148,46 @@
 ;-----------------------------------------------------------------------------
 ; pixel_add_ps avx512 code start
 ;-----------------------------------------------------------------------------
-%macro PROCESS_ADD_PS_64x8_AVX512 0
+%macro PROCESS_ADD_PS_64x4_AVX512 0
     pmovzxbw    m0,         [r2]
     pmovzxbw    m1,         [r2 + 32]
     movu        m2,         [r3]
     movu        m3,         [r3 + 64]
-    pmovzxbw    m4,         [r2 + r4]
-    pmovzxbw    m5,         [r2 + r4 + 32]
-    movu        m6,         [r3 + r5]
-    movu        m7,         [r3 + r5 + 64]
-
     paddw       m0,         m2
     paddw       m1,         m3
-    paddw       m4,         m6
-    paddw       m5,         m7
     packuswb    m0,         m1
-    packuswb    m4,         m5
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
+    vpermq      m0,         m4,      m0
     movu        [r0],       m0
-    movu        [r0 + r1],  m4
-
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-
-    pmovzxbw    m0,         [r2]
-    pmovzxbw    m1,         [r2 + 32]
-    movu        m2,         [r3]
-    movu        m3,         [r3 + 64]
-    pmovzxbw    m4,         [r2 + r4]
-    pmovzxbw    m5,         [r2 + r4 + 32]
-    movu        m6,         [r3 + r5]
-    movu        m7,         [r3 + r5 + 64]
-
+    pmovzxbw    m0,         [r2 + r4]
+    pmovzxbw    m1,         [r2 + r4 + 32]
+    movu        m2,         [r3 + r5]
+    movu        m3,         [r3 + r5 + 64]
     paddw       m0,         m2
     paddw       m1,         m3
-    paddw       m4,         m6
-    paddw       m5,         m7
     packuswb    m0,         m1
-    packuswb    m4,         m5
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
-    movu        [r0],       m0
-    movu        [r0 + r1],  m4
-
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-
-    pmovzxbw    m0,         [r2]
-    pmovzxbw    m1,         [r2 + 32]
-    movu        m2,         [r3]
-    movu        m3,         [r3 + 64]
-    pmovzxbw    m4,         [r2 + r4]
-    pmovzxbw    m5,         [r2 + r4 + 32]
-    movu        m6,         [r3 + r5]
-    movu        m7,         [r3 + r5 + 64]
-
+    vpermq      m0,         m4,      m0
+    movu        [r0 + r1],  m0
+    pmovzxbw    m0,         [r2 + 2 * r4]
+    pmovzxbw    m1,         [r2 + 2 * r4 + 32]
+    movu        m2,         [r3 + 2 * r5]
+    movu        m3,         [r3 + 2 * r5 + 64]
     paddw       m0,         m2
     paddw       m1,         m3
-    paddw       m4,         m6
-    paddw       m5,         m7
     packuswb    m0,         m1
-    packuswb    m4,         m5
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
-    movu        [r0],       m0
-    movu        [r0 + r1],  m4
+    vpermq      m0,         m4,      m0
+    movu        [r0 + 2 * r1],       m0
 
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-
-    pmovzxbw    m0,         [r2]
-    pmovzxbw    m1,         [r2 + 32]
-    movu        m2,         [r3]
-    movu        m3,         [r3 + 64]
-    pmovzxbw    m4,         [r2 + r4]
-    pmovzxbw    m5,         [r2 + r4 + 32]
-    movu        m6,         [r3 + r5]
-    movu        m7,         [r3 + r5 + 64]
-
+    pmovzxbw    m0,         [r2 + r7]
+    pmovzxbw    m1,         [r2 + r7 + 32]
+    movu        m2,         [r3 + r8]
+    movu        m3,         [r3 + r8 + 64]
     paddw       m0,         m2
     paddw       m1,         m3
-    paddw       m4,         m6
-    paddw       m5,         m7
     packuswb    m0,         m1
-    packuswb    m4,         m5
-    vpermq      m0,         m0, 11011000b
-    vpermq      m4,         m4, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    vshufi64x2  m4,         m4, 11011000b
-    movu        [r0],       m0
-    movu        [r0 + r1],  m4
+    vpermq      m0,         m4,      m0
+    movu        [r0 + r6],       m0
 %endmacro
-
-%macro PROCESS_ADD_PS_64x8_HBD_AVX512 0
-    movu    m0,     [r2]
-    movu    m1,     [r2 + mmsize]
-    movu    m2,     [r3]
-    movu    m3,     [r3 + mmsize]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0],                m0
-    movu    [r0 + mmsize],       m1
-
-    movu    m0,     [r2 + r4]
-    movu    m1,     [r2 + r4 + mmsize]
-    movu    m2,     [r3 + r5]
-    movu    m3,     [r3 + r5 + mmsize]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0 + r1],           m0
-    movu    [r0 + r1 + mmsize],  m1
-
-    movu    m0,     [r2 + r4 * 2]
-    movu    m1,     [r2 + r4 * 2 + mmsize]
-    movu    m2,     [r3 + r5 * 2]
-    movu    m3,     [r3 + r5 * 2 + mmsize]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0 + r1 * 2],           m0
-    movu    [r0 + r1 * 2 + mmsize],  m1
-
-    movu    m0,     [r2 + r6]
-    movu    m1,     [r2 + r6 + mmsize]
-    movu    m2,     [r3 + r7]
-    movu    m3,     [r3 + r7 + mmsize]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0 + r8],               m0
-    movu    [r0 + r8 + mmsize],      m1
-
-    lea     r0,     [r0 + r1 * 4]
-    lea     r2,     [r2 + r4 * 4]
-    lea     r3,     [r3 + r5 * 4]
-
+%macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
     movu    m0,     [r2]
     movu    m1,     [r2 + mmsize]
     movu    m2,     [r3]
@@ -1343,6 +1232,7 @@
     movu    [r0 + r8],               m0
     movu    [r0 + r8 + mmsize],      m1
 %endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1358,77 +1248,35 @@
     lea              r6,     [r4 * 3]
     lea              r7,     [r5 * 3]
     lea              r8,     [r1 * 3]
-
-    PROCESS_ADD_PS_64x8_HBD_AVX512
+%rep 15
+    PROCESS_ADD_PS_64x4_HBD_AVX512
     lea         r2,         [r2 + r4 * 4]
     lea         r3,         [r3 + r5 * 4]
     lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_64x8_HBD_AVX512
+%endrep
+    PROCESS_ADD_PS_64x4_HBD_AVX512
     RET
 %endif
 %else
 %if ARCH_X86_64
 INIT_ZMM avx512
-cglobal pixel_add_ps_64x64, 6, 7, 8
+cglobal pixel_add_ps_64x64, 6, 9, 4
     add         r5,         r5
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-    PROCESS_ADD_PS_64x8_AVX512
+    lea         r6,         [3 * r1]
+    lea         r7,         [3 * r4]
+    lea         r8,         [3 * r5]
+    mova        m4,         [store_shuf1_avx512]
+%rep 15
+    PROCESS_ADD_PS_64x4_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_64x4_AVX512
     RET
 %endif
 %endif
-
-%macro PROCESS_ADD_PS_32x8_AVX512 0
+%macro PROCESS_ADD_PS_32x4_AVX512 0
     pmovzxbw    m0,         [r2]
     movu        m1,         [r3]
     pmovzxbw    m2,         [r2 + r4]
@@ -1436,12 +1284,9 @@
     paddw       m0,         m1
     paddw       m2,         m3
     packuswb    m0,         m2
-    vpermq      m0,         m0, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    movu        [r0],            ym0
-    vshufi64x2  m0,         m0, 01001110b
-    movu        [r0 + r1],       ym0
-
+    vpermq      m0,         m4,      m0
+    movu           [r0],       ym0
+    vextracti32x8  [r0 + r1],   m0,    1
     pmovzxbw    m0,         [r2 + r4 * 2]
     movu        m1,         [r3 + r5 * 2]
     pmovzxbw    m2,         [r2 + r6]
@@ -1449,70 +1294,11 @@
     paddw       m0,         m1
     paddw       m2,         m3
     packuswb    m0,         m2
-    vpermq      m0,         m0, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    movu        [r0 + r1 * 2],   ym0
-    vshufi64x2  m0,         m0, 01001110b
-    movu        [r0 + r8],       ym0
-
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-
-    pmovzxbw    m0,         [r2]
-    movu        m1,         [r3]
-    pmovzxbw    m2,         [r2 + r4]
-    movu        m3,         [r3 + r5]
-    paddw       m0,         m1
-    paddw       m2,         m3
-    packuswb    m0,         m2
-    vpermq      m0,         m0, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    movu        [r0],            ym0
-    vshufi64x2  m0,         m0, 01001110b
-    movu        [r0 + r1],       ym0
-
-    pmovzxbw    m0,         [r2 + r4 * 2]
-    movu        m1,         [r3 + r5 * 2]
-    pmovzxbw    m2,         [r2 + r6]
-    movu        m3,         [r3 + r7]
-    paddw       m0,         m1
-    paddw       m2,         m3
-    packuswb    m0,         m2
-    vpermq      m0,         m0, 11011000b
-    vshufi64x2  m0,         m0, 11011000b
-    movu        [r0 + r1 * 2],   ym0
-    vshufi64x2  m0,         m0, 01001110b
-    movu        [r0 + r8],       ym0
+    vpermq      m0,         m4,      m0
+    movu           [r0 + r1 * 2],   ym0
+    vextracti32x8  [r0 + r8],        m0,    1
 %endmacro
-
-%macro PROCESS_ADD_PS_32x8_HBD_AVX512 0
-    movu    m0,     [r2]
-    movu    m1,     [r2 + r4]
-    movu    m2,     [r3]
-    movu    m3,     [r3 + r5]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0],                m0
-    movu    [r0 + r1],           m1
-
-    movu    m0,     [r2 + r4 * 2]
-    movu    m1,     [r2 + r6]
-    movu    m2,     [r3 + r5 * 2]
-    movu    m3,     [r3 + r7]
-    paddw   m0,     m2
-    paddw   m1,     m3
-
-    CLIPW2  m0, m1, m4, m5
-    movu    [r0 + r1 * 2],           m0
-    movu    [r0 + r8],               m1
-
-    lea     r0,     [r0 + r1 * 4]
-    lea     r2,     [r2 + r4 * 4]
-    lea     r3,     [r3 + r5 * 4]
-
+%macro PROCESS_ADD_PS_32x4_HBD_AVX512 0
     movu    m0,     [r2]
     movu    m1,     [r2 + r4]
     movu    m2,     [r3]
@@ -1535,6 +1321,7 @@
     movu    [r0 + r1 * 2],           m0
     movu    [r0 + r8],               m1
 %endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1550,22 +1337,14 @@
     lea              r6,     [r4 * 3]
     lea              r7,     [r5 * 3]
     lea              r8,     [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_HBD_AVX512
+%rep 7
+    PROCESS_ADD_PS_32x4_HBD_AVX512
     lea         r2,         [r2 + r4 * 4]
     lea         r3,         [r3 + r5 * 4]
     lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
+%endrep
+    PROCESS_ADD_PS_32x4_HBD_AVX512
     RET
-
 INIT_ZMM avx512
 cglobal pixel_add_ps_32x64, 6, 9, 6
     vbroadcasti32x8  m5,     [pw_pixel_max]
@@ -1576,98 +1355,48 @@
     lea              r6,     [r4 * 3]
     lea              r7,     [r5 * 3]
     lea              r8,     [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_HBD_AVX512
+%rep 15
+    PROCESS_ADD_PS_32x4_HBD_AVX512
     lea         r2,         [r2 + r4 * 4]
     lea         r3,         [r3 + r5 * 4]
     lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_HBD_AVX512
+%endrep
+    PROCESS_ADD_PS_32x4_HBD_AVX512
     RET
 %endif
 %else
 %if ARCH_X86_64
 INIT_ZMM avx512
-cglobal pixel_add_ps_32x32, 6, 9, 4
+cglobal pixel_add_ps_32x32, 6, 9, 5
     add         r5,         r5
     lea         r6,         [r4 * 3]
     lea         r7,         [r5 * 3]
     lea         r8,         [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_AVX512
+    mova        m4,         [store_shuf1_avx512]
+%rep 7
+    PROCESS_ADD_PS_32x4_AVX512
     lea         r2,         [r2 + r4 * 4]
     lea         r3,         [r3 + r5 * 4]
     lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
+%endrep
+    PROCESS_ADD_PS_32x4_AVX512
     RET
 
 INIT_ZMM avx512
-cglobal pixel_add_ps_32x64, 6, 9, 4
+cglobal pixel_add_ps_32x64, 6, 9, 5
     add         r5,         r5
     lea         r6,         [r4 * 3]
     lea         r7,         [r5 * 3]
     lea         r8,         [r1 * 3]
+    mova        m4,         [store_shuf1_avx512]
 
-    PROCESS_ADD_PS_32x8_AVX512
+%rep 15
+    PROCESS_ADD_PS_32x4_AVX512
     lea         r2,         [r2 + r4 * 4]
     lea         r3,         [r3 + r5 * 4]
     lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea         r2,         [r2 + r4 * 4]
-    lea         r3,         [r3 + r5 * 4]
-    lea         r0,         [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
+%endrep
+    PROCESS_ADD_PS_32x4_AVX512
     RET
 %endif
 %endif
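
Why dq 0, 2, 4, 6, 1, 3, 5, 7 is the right constant: packuswb on a
512-bit register packs within each 128-bit lane, so the eight qwords of
the result come out interleaved as A0 B0 A1 B1 A2 B2 A3 B3 (A = bytes
from the first source, B = bytes from the second) rather than in row
order A0..A3 B0..B3. A single cross-lane vpermq with that index vector
restores row order in one instruction, where the old code needed a
vpermq immediate plus a vshufi64x2 per store. A small C model of the
permute, as a sketch rather than x265 code:

    #include <stdint.h>

    /* Model of the store_shuf1_avx512 permute: out[i] = in[shuf[i]],
     * which is what "vpermq m0, m4, m0" computes on the eight qwords of
     * m0 once m4 holds the constant. */
    static const uint64_t shuf[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };

    static void store_shuf1_model(uint64_t out[8], const uint64_t in[8])
    {
        /* in  = A0 B0 A1 B1 A2 B2 A3 B3  (per-lane packuswb output)
         * out = A0 A1 A2 A3 B0 B1 B2 B3  (row order, ready to store) */
        for (int i = 0; i < 8; i++)
            out[i] = in[shuf[i]];
    }

In the 32-pixel-wide path two rows are packed into one zmm, so after
the same permute the low 256 bits hold one row and the high 256 bits
hold the next; the high half is written directly with vextracti32x8,
replacing the second vshufi64x2-and-store of the old code. The %rep
change is purely structural: fifteen repetitions of the 4-row macro
plus pointer advances, followed by one trailing invocation, cover the
same 64 rows that the eight unrolled 64x8 blocks did (7 + 1 invocations
for the 32-row blocks).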

