[x265] [PATCH 032 of 307] x86: AVX512 convert_p2s 48x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:30 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500447343 -19800
#      Wed Jul 19 12:25:43 2017 +0530
# Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b
# Parent  60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
x86: AVX512 convert_p2s 48x64

AVX2 performance  : 2.22x
AVX512 performance: 3.01x

diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 19 12:25:43 2017 +0530
@@ -3841,6 +3841,7 @@
         p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
         p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
         p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Jul 19 12:25:43 2017 +0530
@@ -3047,6 +3047,115 @@
     jnz        .loop
     RET
 
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+%macro PROCESS_P2S_48x8_AVX512 0 ; 8 rows x 48 pixels: dst = (src << 6) - m4; caller supplies r5 = 3*r1, r6 = 3*r3 and the constant in m4
+    pmovzxbw    m0, [r0]                ; rows 0-3, leading part of each row: zero-extend source bytes to words
+    pmovzxbw    m1, [r0 + r1]           ; r1 = source row pitch
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]           ; r5 = 3 * r1 (x86 addressing has no *3 scale)
+    psllw       m0, 6                   ; multiply every word by 64
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4                  ; subtract the per-word constant the caller must have loaded into m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2],           m0      ; unaligned stores; r3 is used as the destination row pitch in bytes
+    movu        [r2 + r3],      m1
+    movu        [r2 + r3 * 2],  m2
+    movu        [r2 + r6],      m3      ; r6 = 3 * r3
+
+    pmovzxbw    ym0, [r0 + 32]          ; rows 0-3, tail of each row: source bytes from offset 32, half-width (ym) registers
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4                ; same constant, low half of m4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    movu        [r2 + 64],           ym0 ; destination offset 64 bytes = 32 words, matching source byte offset 32
+    movu        [r2 + r3 + 64],      ym1
+    movu        [r2 + r3 * 2 + 64],  ym2
+    movu        [r2 + r6 + 64],      ym3
+
+    lea         r0, [r0 + r1 * 4]       ; step source down 4 rows for the second half of the macro
+    lea         r2, [r2 + r3 * 4]       ; step destination down 4 rows
+
+    pmovzxbw    m0, [r0]                ; rows 4-7, leading part of each row (same sequence as rows 0-3)
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2],           m0
+    movu        [r2 + r3],      m1
+    movu        [r2 + r3 * 2],  m2
+    movu        [r2 + r6],      m3
+
+    pmovzxbw    ym0, [r0 + 32]          ; rows 4-7, tail of each row
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    movu        [r2 + 64],           ym0
+    movu        [r2 + r3 + 64],      ym1
+    movu        [r2 + r3 * 2 + 64],  ym2
+    movu        [r2 + r6 + 64],      ym3 ; on exit r0/r2 still point at row 4 of this group; the caller advances the last 4 rows
+%endmacro
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_48x64, 3,7,5 ; 3 args preloaded (r0 = src, r1 = srcStride, r2 = dst), 7 GPRs, 5 vector regs (m0-m4)
+    mov         r3d, r3m                ; r3 = dstStride (4th argument, fetched manually)
+    add         r3d, r3d                ; double it: the stores address dst in bytes, two per output word
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride, offset of the 4th row in each group of 4
+    lea         r6, [r3 * 3]            ; r6 = 3 * dst pitch in bytes
+
+    ; offset constant: must live in m4, the register PROCESS_P2S_48x8_AVX512 subtracts from every word
+    vpbroadcastd m4, [pw_2000]
+
+    PROCESS_P2S_48x8_AVX512             ; rows 0-7
+    lea         r0, [r0 + r1 * 4]       ; macro advanced 4 of its 8 rows itself; skip the other 4
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 8-15
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 16-23
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 24-31
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 32-39
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 40-47
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 48-55
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512             ; rows 56-63; no pointer advance needed after the last group
+    RET
 
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]


More information about the x265-devel mailing list