[x265] [PATCH 047 of 307] x86: AVX512 convert_p2s_64xN for high bit depth

mythreyi at multicorewareinc.com
Sat Apr 7 04:30:45 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500987523 -19800
#      Tue Jul 25 18:28:43 2017 +0530
# Node ID d05b920865e7c9e8cc9441e77df888b48acb50d1
# Parent  9e1401dcdfc3c9fb633d81b7b39321ac5969a245
x86: AVX512 convert_p2s_64xN for high bit depth

Size  | AVX2 speedup over C | AVX512 speedup over C
----------------------------------------------------
64x16 |       10.53x        |        18.40x
64x32 |       11.10x        |        19.51x
64x48 |       11.14x        |        19.07x
64x64 |       11.26x        |        20.25x
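
For reference, the operation these kernels vectorize is x265's pixel-to-short
conversion: each 16-bit input pixel is shifted up to the 14-bit internal
interpolation precision (psllw by 14 - BIT_DEPTH) and re-centered by
subtracting the pw_2000 offset (0x2000). A minimal C++ sketch of the scalar
equivalent follows; it is modeled on the asm below, and the names are
illustrative rather than the exact x265 source:

    #include <cstdint>

    // Scalar sketch of filterPixelToShort for HIGH_BIT_DEPTH builds.
    // bitDepth is the build-time pixel depth (e.g. 10 or 12); 14 is the
    // internal interpolation precision and 0x2000 is the DC offset that
    // the AVX512 code broadcasts from pw_2000 into m4.
    template<int width, int height, int bitDepth>
    static void pixelToShort_ref(const uint16_t* src, intptr_t srcStride,
                                 int16_t* dst, intptr_t dstStride)
    {
        const int shift = 14 - bitDepth;    // the psllw amount
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << shift) - 0x2000);  // psubw m4
            src += srcStride;
            dst += dstStride;
        }
    }

Instantiated as pixelToShort_ref<64, 16, 10>, this computes the same result
the 64x16 kernel below produces with two P2S_64x8_AVX512 expansions.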

diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 16:23:42 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 18:28:43 2017 +0530
@@ -2230,6 +2230,10 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512);
 
+        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512);
+        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
+        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
+        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
     }
 }
 #else // if HIGH_BIT_DEPTH
diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Jul 25 16:23:42 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Tue Jul 25 18:28:43 2017 +0530
@@ -301,6 +301,183 @@
     FILTER_VER_LUMA_sse2 ps, 64, 16
     FILTER_VER_LUMA_sse2 ps, 16, 64
 
+;-----------------------------------------------------------------------------
+;p2s avx512 code start
+;-----------------------------------------------------------------------------
+%macro P2S_64x8_AVX512 0
+    movu       m0, [r0]
+    movu       m1, [r0 + r1]
+    movu       m2, [r0 + r1 * 2]
+    movu       m3, [r0 + r5]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2], m0
+    movu       [r2 + r3], m1
+    movu       [r2 + r3 * 2], m2
+    movu       [r2 + r4], m3
+
+    movu       m0, [r0 + mmsize]
+    movu       m1, [r0 + r1 + mmsize]
+    movu       m2, [r0 + r1 * 2 + mmsize]
+    movu       m3, [r0 + r5 + mmsize]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2 + mmsize], m0
+    movu       [r2 + r3 + mmsize], m1
+    movu       [r2 + r3 * 2 + mmsize], m2
+    movu       [r2 + r4 + mmsize], m3
+
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+
+    movu       m0, [r0]
+    movu       m1, [r0 + r1]
+    movu       m2, [r0 + r1 * 2]
+    movu       m3, [r0 + r5]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2], m0
+    movu       [r2 + r3], m1
+    movu       [r2 + r3 * 2], m2
+    movu       [r2 + r4], m3
+
+    movu       m0, [r0 + mmsize]
+    movu       m1, [r0 + r1 + mmsize]
+    movu       m2, [r0 + r1 * 2 + mmsize]
+    movu       m3, [r0 + r5 + mmsize]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2 + mmsize], m0
+    movu       [r2 + r3 + mmsize], m1
+    movu       [r2 + r3 * 2 + mmsize], m2
+    movu       [r2 + r4 + mmsize], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x16, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x32, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x48, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x64, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_64x8_AVX512
+    RET
+;-----------------------------------------------------------------------------------------------------------------------------
+;p2s avx512 code end
+;-----------------------------------------------------------------------------------------------------------------------------
 
 %macro PROCESS_LUMA_VER_W4_4R 0
     movq       m0, [r0]

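Each P2S_64x8_AVX512 expansion above covers eight rows: two four-row passes,
with two 32-pixel ZMM loads per 64-pixel row (mmsize is 64 bytes, i.e. 32
16-bit pixels), so the 64xN bodies repeat the macro N/8 times. Once the
asm-primitives.cpp entries are installed, callers reach the kernels through
the convert_p2s pointer using the signature noted in the asm comment; strides
are passed in pixels (the prologue's "add r1d, r1d" / "add r3d, r3d" convert
them to byte strides). A hypothetical call site for the 64x64 case, with
illustrative buffer names:

    // pixel is uint16_t in HIGH_BIT_DEPTH builds; strides are in elements
    uint16_t src[64 * 64];
    int16_t  dst[64 * 64];
    p.pu[LUMA_64x64].convert_p2s(src, 64, dst, 64);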
