[x265] [PATCH 055 of 307] [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth

mythreyi at multicorewareinc.com
Sat Apr 7 04:30:53 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501239623 -19800
#      Fri Jul 28 16:30:23 2017 +0530
# Node ID 215976d65b80985998b2597b8ba4c697f1465a1d
# Parent  e65ac86010af8f7ab1e5b43591330eeb6c818473
[x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     27.57x       |      35.17x
32x16   |     27.96x       |      40.74x
32x24   |     31.21x       |      45.19x
32x32   |     32.12x       |      47.23x
32x64   |     28.79x       |      53.35x
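
For reference, these kernels compute the ordinary sum of absolute differences over a 32-pixel-wide block of 16-bit samples. A minimal scalar sketch of the operation (the function and parameter names here are illustrative, not part of the patch; strides are in pixels, matching the primitive prototype shown in the assembly comment below):

    #include <cstdint>
    #include <cstdlib>

    /* Illustrative scalar reference for pixel_sad_32xN (high bit depth):
     * sums |pix1 - pix2| over a 32 x lines block of 16-bit samples. */
    static int sad_32xN_c(const uint16_t* pix1, intptr_t stride1,
                          const uint16_t* pix2, intptr_t stride2, int lines)
    {
        int sum = 0;
        for (int y = 0; y < lines; y++)
        {
            for (int x = 0; x < 32; x++)
                sum += std::abs((int)pix1[x] - (int)pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

The AVX-512 kernels below compute the same value, but load a full 32-pixel row per 512-bit register and keep the per-row sums in 16-bit lanes until they are widened for accumulation.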

diff -r e65ac86010af -r 215976d65b80 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jul 28 16:30:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jul 28 16:30:23 2017 +0530
@@ -2262,6 +2262,12 @@
         p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
         p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
 
+        p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
+        p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
+        p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
+        p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
+        p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
diff -r e65ac86010af -r 215976d65b80 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Fri Jul 28 16:30:23 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Fri Jul 28 16:30:23 2017 +0530
@@ -1208,6 +1208,179 @@
     movd           eax, xm0
 %endmacro
 
+%macro PROCESS_SAD_32x8_AVX512 0
+    movu    m1, [r2]                    ; load rows 0-3 of the second block (32 x 16-bit = one zmm per row)
+    movu    m2, [r2 + r3]
+    movu    m3, [r2 + 2 * r3]
+    movu    m4, [r2 + r5]
+    psubw   m1, [r0]                    ; subtract the corresponding rows of the first block
+    psubw   m2, [r0 + r1]
+    psubw   m3, [r0 + 2 * r1]
+    psubw   m4, [r0 + r4]
+    pabsw   m1, m1                      ; absolute differences
+    pabsw   m2, m2
+    pabsw   m3, m3
+    pabsw   m4, m4
+    paddw   m1, m2
+    paddw   m3, m4
+    paddw   m5, m1, m3                  ; 16-bit SAD of rows 0-3, kept per lane
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+
+    movu    m1, [r2]
+    movu    m2, [r2 + r3]
+    movu    m3, [r2 + 2 * r3]
+    movu    m4, [r2 + r5]
+    psubw   m1, [r0]
+    psubw   m2, [r0 + r1]
+    psubw   m3, [r0 + 2 * r1]
+    psubw   m4, [r0 + r4]
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    pabsw   m4, m4
+    paddw   m1, m2
+    paddw   m3, m4
+    paddw   m1, m3                      ; 16-bit SAD of rows 4-7, kept per lane
+
+    pmaddwd m5, m6                      ; multiply by pw_1: widen 16-bit sums to dwords
+    paddd   m0, m5                      ; accumulate in the dword lanes of m0
+    pmaddwd m1, m6
+    paddd   m0, m1
+%endmacro
+
+%macro PROCESS_SAD_AVX512_END 0
+    vextracti32x8  ym1, m0, 1           ; fold the upper 256 bits into the lower half
+    paddd          ym0, ym1
+    vextracti64x2  xm1, m0, 1           ; fold the upper 128 bits of the ymm
+    paddd          xm0, xm1
+    pshufd         xm1, xm0, 00001110b  ; add the high qword to the low qword
+    paddd          xm0, xm1
+    pshufd         xm1, xm0, 00000001b  ; add the remaining dword pair
+    paddd          xm0, xm1
+    movd           eax, xm0             ; final SAD in eax
+%endmacro
+
+
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_sad_32x8, 4,6,7
+    pxor    m0, m0
+
+    vbroadcasti32x8 m6, [pw_1]          ; constant for the pmaddwd widening step
+
+    add     r3d, r3d                    ; strides are passed in pixels: double for 16-bit samples
+    add     r1d, r1d
+    lea     r4d, [r1 * 3]               ; r4 = 3 * stride1 (bytes)
+    lea     r5d, [r3 * 3]               ; r5 = 3 * stride2 (bytes)
+
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+
+INIT_ZMM avx512
+cglobal pixel_sad_32x16, 4,6,7
+    pxor    m0, m0
+
+    vbroadcasti32x8 m6, [pw_1]
+
+    add     r3d, r3d
+    add     r1d, r1d
+    lea     r4d, [r1 * 3]
+    lea     r5d, [r3 * 3]
+
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_32x24, 4,6,7
+    pxor    m0, m0
+
+    vbroadcasti32x8 m6, [pw_1]
+
+    add     r3d, r3d
+    add     r1d, r1d
+    lea     r4d, [r1 * 3]
+    lea     r5d, [r3 * 3]
+
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_32x32, 4,6,7
+    pxor    m0, m0
+
+    vbroadcasti32x8 m6, [pw_1]
+
+    add     r3d, r3d
+    add     r1d, r1d
+    lea     r4d, [r1 * 3]
+    lea     r5d, [r3 * 3]
+
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_32x64, 4,6,7
+    pxor    m0, m0
+
+    vbroadcasti32x8 m6, [pw_1]
+
+    add     r3d, r3d
+    add     r1d, r1d
+    lea     r4d, [r1 * 3]
+    lea     r5d, [r3 * 3]
+
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    lea            r2, [r2 + 4 * r3]
+    lea            r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
 
 
 ;-----------------------------------------------------------------------------
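
For readers more comfortable with intrinsics, the sketch below mirrors what PROCESS_SAD_32x8_AVX512 and PROCESS_SAD_AVX512_END do; it is an illustrative equivalent, not code from the patch, and the function name and loop structure are hypothetical. It relies on the same assumption as the assembly: with 10/12-bit input each absolute difference is at most 4095, so four rows summed per 16-bit lane stay below 32767 and can safely be widened with a madd against a vector of ones.

    #include <immintrin.h>
    #include <cstdint>

    /* Illustrative AVX-512 equivalent of the assembly above.
     * Assumes a build with AVX512F + AVX512BW and 10/12-bit samples,
     * so four rows of absolute differences fit in a 16-bit lane. */
    static int sad_32xN_avx512_sketch(const uint16_t* pix1, intptr_t stride1,
                                      const uint16_t* pix2, intptr_t stride2,
                                      int lines)
    {
        const __m512i ones = _mm512_set1_epi16(1);   // pw_1
        __m512i acc = _mm512_setzero_si512();        // m0: dword accumulators

        for (int y = 0; y < lines; y += 4)
        {
            __m512i sum16 = _mm512_setzero_si512();
            for (int row = 0; row < 4; row++)
            {
                // one 512-bit load covers a full 32-pixel row of 16-bit samples
                __m512i a = _mm512_loadu_si512((const void*)(pix1 + row * stride1));
                __m512i b = _mm512_loadu_si512((const void*)(pix2 + row * stride2));
                // psubw + pabsw
                __m512i d = _mm512_abs_epi16(_mm512_sub_epi16(a, b));
                sum16 = _mm512_add_epi16(sum16, d);  // paddw row sums
            }
            // pmaddwd with pw_1: widen the 16-bit partial sums and accumulate as dwords
            acc = _mm512_add_epi32(acc, _mm512_madd_epi16(sum16, ones));
            pix1 += 4 * stride1;
            pix2 += 4 * stride2;
        }
        // PROCESS_SAD_AVX512_END: horizontal reduction of the 16 dword lanes
        return _mm512_reduce_add_epi32(acc);
    }

The sketch processes four rows per iteration, where the assembly macro unrolls eight; the result is the same, and the hand-written kernels additionally avoid any dependence on the compiler's AVX-512 code generation.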

