[x265] [PATCH 031 of 307] x86: AVX512 convert_p2s_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:29 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500445753 -19800
#      Wed Jul 19 11:59:13 2017 +0530
# Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
# Parent  a77082ebfa67b40f3dbb8cd45b54c17e710a104c
x86: AVX512 convert_p2s_32xN

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     1.51x       |      1.54x
32x16   |     2.18x       |      3.62x
32x24   |     2.26x       |      3.58x
32x32   |     2.28x       |      3.94x
32x64   |     2.20x       |      4.06x

diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 19 11:59:13 2017 +0530
@@ -3836,6 +3836,19 @@
         p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
         p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
         p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2); // NOTE(review): _avx2 here, while chroma 32x8 below uses _avx512 - confirm this is intentional
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); // the 32x48 kernel is wired only to 4:2:2 chroma in this hunk
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
 
     }
 #endif
diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Jul 19 11:59:13 2017 +0530
@@ -1956,6 +1956,184 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%macro PROCESS_P2S_32x8_AVX512 0
+    ; In: r0 = src, r1 = src stride, r5 = 3 * r1, r2 = dst, r3 = dst stride (bytes), r6 = 3 * r3, m4 = bias words.
+    ; Out: 8 rows of 32 samples written; r0/r2 are left advanced by 4 rows only. Clobbers m0-m3.
+    pmovzxbw    m0, [r0]                ; row 0: zero-extend 32 bytes to 32 words
+    psllw       m0, 6                   ;        sample << 6
+    psubw       m0, m4                  ;        minus the bias held in m4
+    pmovzxbw    m1, [r0 + r1]           ; row 1
+    psllw       m1, 6
+    psubw       m1, m4
+    pmovzxbw    m2, [r0 + r1 * 2]       ; row 2
+    psllw       m2, 6
+    psubw       m2, m4
+    pmovzxbw    m3, [r0 + r5]           ; row 3
+    psllw       m3, 6
+    psubw       m3, m4
+
+    movu        [r2],           m0      ; all four loads are done before the first store
+    movu        [r2 + r3],      m1
+    movu        [r2 + r3 * 2],  m2
+    movu        [r2 + r6],      m3
+
+    lea         r0, [r0 + r1 * 4]       ; src += 4 rows
+    lea         r2, [r2 + r3 * 4]       ; dst += 4 rows
+    ; rows 4-7: same per-row pipeline on the advanced pointers
+    pmovzxbw    m0, [r0]                ; row 4
+    psllw       m0, 6
+    psubw       m0, m4
+    pmovzxbw    m1, [r0 + r1]           ; row 5
+    psllw       m1, 6
+    psubw       m1, m4
+    pmovzxbw    m2, [r0 + r1 * 2]       ; row 6
+    psllw       m2, 6
+    psubw       m2, m4
+    pmovzxbw    m3, [r0 + r5]           ; row 7
+    psllw       m3, 6
+    psubw       m3, m4
+
+    movu        [r2],           m0
+    movu        [r2 + r3],      m1
+    movu        [r2 + r3 * 2],  m2
+    movu        [r2 + r6],      m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x8, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 32x8 needs exactly one 8-row pass
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x16, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 2 passes x 8 rows; the macro stops 4 rows short, so step both pointers between passes
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 8
+    lea         r0, [r0 + r1 * 4]       ; src -> row 8
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x24, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 3 passes x 8 rows; the macro stops 4 rows short, so step both pointers between passes
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 8
+    lea         r0, [r0 + r1 * 4]       ; src -> row 8
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 16
+    lea         r0, [r0 + r1 * 4]       ; src -> row 16
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x32, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 4 passes x 8 rows; the macro stops 4 rows short, so step both pointers between passes
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 8
+    lea         r0, [r0 + r1 * 4]       ; src -> row 8
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 16
+    lea         r0, [r0 + r1 * 4]       ; src -> row 16
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 24
+    lea         r0, [r0 + r1 * 4]       ; src -> row 24
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x48, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 6 passes x 8 rows; the macro stops 4 rows short, so step both pointers between passes
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 8
+    lea         r0, [r0 + r1 * 4]       ; src -> row 8
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 16
+    lea         r0, [r0 + r1 * 4]       ; src -> row 16
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 24
+    lea         r0, [r0 + r1 * 4]       ; src -> row 24
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 32
+    lea         r0, [r0 + r1 * 4]       ; src -> row 32
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 40
+    lea         r0, [r0 + r1 * 4]       ; src -> row 40
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x64, 3, 7, 5
+    ; m4 = dword at pw_2000 (defined outside this hunk) replicated across the register
+    vpbroadcastd      m4, [pw_2000]
+    lea         r5, [r1 * 3]            ; r5 = 3 * src stride
+    mov         r3d, r3m                ; r3 = 4th argument (dst stride)
+    add         r3d, r3d                ; doubled - presumably int16_t elements -> bytes
+    lea         r6, [r3 * 3]            ; r6 = 3 * doubled dst stride
+
+    ; 8 passes x 8 rows; the macro stops 4 rows short, so step both pointers between passes
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 8
+    lea         r0, [r0 + r1 * 4]       ; src -> row 8
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 16
+    lea         r0, [r0 + r1 * 4]       ; src -> row 16
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 24
+    lea         r0, [r0 + r1 * 4]       ; src -> row 24
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 32
+    lea         r0, [r0 + r1 * 4]       ; src -> row 32
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 40
+    lea         r0, [r0 + r1 * 4]       ; src -> row 40
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 48
+    lea         r0, [r0 + r1 * 4]       ; src -> row 48
+    PROCESS_P2S_32x8_AVX512
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 56
+    lea         r0, [r0 + r1 * 4]       ; src -> row 56
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
 %macro P2S_H_64xN 1
 INIT_XMM ssse3
 cglobal filterPixelToShort_64x%1, 3, 7, 6


More information about the x265-devel mailing list