[x265] [PATCH 030 of 307] x86: AVX512 convert_p2s 64xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:28 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499858302 -19800
#      Wed Jul 12 16:48:22 2017 +0530
# Node ID a77082ebfa67b40f3dbb8cd45b54c17e710a104c
# Parent  576a93cba7d189fddba3466a21188f0ece3ed278
x86: AVX512 convert_p2s 64xN

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
64x16   |     2.05x       |      3.77x
64x32   |     2.16x       |      3.88x
64x48   |     2.13x       |      3.91x
64x64   |     2.16x       |      4.00x

diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 12 16:48:22 2017 +0530
@@ -3832,6 +3832,11 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
 
+        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512);
+        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
+        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
+        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+
     }
 #endif
 }
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Jul 12 16:48:22 2017 +0530
@@ -2269,6 +2269,186 @@
     P2S_H_64xN_avx2 48
 
 ;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+%macro PROCESS_P2S_64x8_AVX512 0 ; converts 8 rows: dst = (src << 6) - m4 per pixel; in: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride in bytes, r5 = 3*r1, r6 = 3*r3, m4 = constant; clobbers m0-m3
+    pmovzxbw    m0, [r0]                        ; row 0, first half: zero-extend mmsize/2 source bytes to words
+    pmovzxbw    m1, [r0 + mmsize/2]             ; row 0, second half (with mmsize = 64 under INIT_ZMM a row is 64 pixels)
+    pmovzxbw    m2, [r0 + r1]                   ; row 1, first half
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]        ; row 1, second half
+
+    psllw       m0, 6                           ; scale every word by 64
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4                          ; subtract the constant the caller placed in m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2], m0                        ; dst row 0: each source half widens to one full register (mmsize bytes)
+    movu        [r2 + mmsize], m1
+    movu        [r2 + r3], m2                   ; dst row 1
+    movu        [r2 + r3 + mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 * 2]               ; row 2
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw    m2, [r0 + r5]                   ; row 3 (r5 is expected to hold 3 * r1)
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2 + r3 * 2], m0               ; dst row 2
+    movu        [r2 + r3 * 2 + mmsize], m1
+    movu        [r2 + r6], m2                   ; dst row 3 (r6 is expected to hold 3 * r3)
+    movu        [r2 + r6 + mmsize], m3
+
+    lea         r0, [r0 + r1 * 4]               ; step src to row 4; r0 is left here on exit, so callers add 4 more rows
+    lea         r2, [r2 + r3 * 4]               ; step dst to row 4; same note applies to r2
+
+    pmovzxbw    m0, [r0]                        ; row 4 (second group repeats the sequence above on rows 4-7)
+    pmovzxbw    m1, [r0 + mmsize/2]
+    pmovzxbw    m2, [r0 + r1]                   ; row 5
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2], m0                        ; dst row 4
+    movu        [r2 + mmsize], m1
+    movu        [r2 + r3], m2                   ; dst row 5
+    movu        [r2 + r3 + mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 * 2]               ; row 6
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw    m2, [r0 + r5]                   ; row 7
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2 + r3 * 2], m0               ; dst row 6
+    movu        [r2 + r3 * 2 + mmsize], m1
+    movu        [r2 + r6], m2                   ; dst row 7
+    movu        [r2 + r6 + mmsize], m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x64, 3, 7, 5 ; 64 rows = 8 x PROCESS_P2S_64x8_AVX512; r0 = src, r1 = srcStride, r2 = dst, 4th arg (dstStride) read from r3m
+    mov         r3d, r3m                        ; only 3 args are auto-loaded, so fetch dstStride by hand
+    add         r3d, r3d                        ; double it: dst elements are int16_t, the macro addresses dst in bytes
+    lea         r5, [r1 * 3]                    ; r5 = 3 * srcStride, the macro's offset for every 4th source row
+    lea         r6, [r3 * 3]                    ; r6 = 3 * dstStride (bytes), same role on the dst side
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]             ; m4 is the register PROCESS_P2S_64x8_AVX512 subtracts from every word
+
+    PROCESS_P2S_64x8_AVX512                     ; rows 0-7
+    lea         r0, [r0 + r1 * 4]               ; the macro advanced r0/r2 by only 4 rows; skip the remaining 4
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 8-15
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 16-23
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 24-31
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 32-39
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 40-47
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 48-55
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 56-63
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x48, 3, 7, 5 ; 48 rows = 6 x PROCESS_P2S_64x8_AVX512; r0 = src, r1 = srcStride, r2 = dst, 4th arg (dstStride) read from r3m; only m0-m4 are used
+    mov         r3d, r3m                        ; only 3 args are auto-loaded, so fetch dstStride by hand
+    add         r3d, r3d                        ; double it: dst elements are int16_t, the macro addresses dst in bytes
+    lea         r5, [r1 * 3]                    ; r5 = 3 * srcStride, the macro's offset for every 4th source row
+    lea         r6, [r3 * 3]                    ; r6 = 3 * dstStride (bytes), same role on the dst side
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]             ; must be m4 (was m8): PROCESS_P2S_64x8_AVX512 subtracts m4 from every word
+
+    PROCESS_P2S_64x8_AVX512                     ; rows 0-7
+    lea         r0, [r0 + r1 * 4]               ; the macro advanced r0/r2 by only 4 rows; skip the remaining 4
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 8-15
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 16-23
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 24-31
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 32-39
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 40-47
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x32, 3, 7, 5 ; 32 rows = 4 x PROCESS_P2S_64x8_AVX512; r0 = src, r1 = srcStride, r2 = dst, 4th arg (dstStride) read from r3m; only m0-m4 are used
+    mov         r3d, r3m                        ; only 3 args are auto-loaded, so fetch dstStride by hand
+    add         r3d, r3d                        ; double it: dst elements are int16_t, the macro addresses dst in bytes
+    lea         r5, [r1 * 3]                    ; r5 = 3 * srcStride, the macro's offset for every 4th source row
+    lea         r6, [r3 * 3]                    ; r6 = 3 * dstStride (bytes), same role on the dst side
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]             ; must be m4 (was m8): PROCESS_P2S_64x8_AVX512 subtracts m4 from every word
+
+    PROCESS_P2S_64x8_AVX512                     ; rows 0-7
+    lea         r0, [r0 + r1 * 4]               ; the macro advanced r0/r2 by only 4 rows; skip the remaining 4
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 8-15
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 16-23
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 24-31
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x16, 3, 7, 5 ; 16 rows = 2 x PROCESS_P2S_64x8_AVX512; r0 = src, r1 = srcStride, r2 = dst, 4th arg (dstStride) read from r3m; only m0-m4 are used
+    mov         r3d, r3m                        ; only 3 args are auto-loaded, so fetch dstStride by hand
+    add         r3d, r3d                        ; double it: dst elements are int16_t, the macro addresses dst in bytes
+    lea         r5, [r1 * 3]                    ; r5 = 3 * srcStride, the macro's offset for every 4th source row
+    lea         r6, [r3 * 3]                    ; r6 = 3 * dstStride (bytes), same role on the dst side
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]             ; must be m4 (was m8): PROCESS_P2S_64x8_AVX512 subtracts m4 from every word
+
+    PROCESS_P2S_64x8_AVX512                     ; rows 0-7
+    lea         r0, [r0 + r1 * 4]               ; the macro advanced r0/r2 by only 4 rows; skip the remaining 4
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_64x8_AVX512                     ; rows 8-15
+    RET
+
+;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
 %macro P2S_H_12xN 1
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/ipfilter8.h	Wed Jul 12 16:48:22 2017 +0530
@@ -45,5 +45,6 @@
 SETUP_FUNC_DEF(sse3);
 SETUP_FUNC_DEF(sse4);
 SETUP_FUNC_DEF(avx2);
+SETUP_FUNC_DEF(avx512);
 
 #endif // ifndef X265_IPFILTER8_H


More information about the x265-devel mailing list