[x265] [PATCH 048 of 307] x86: AVX512 convert_p2s_32xN for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:46 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500988851 -19800
#      Tue Jul 25 18:50:51 2017 +0530
# Node ID b4c2149e9bb1119857363094492b50e85593fb74
# Parent  d05b920865e7c9e8cc9441e77df888b48acb50d1
x86: AVX512 convert_p2s_32xN for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |      7.85x        |       7.95x
32x16 |      9.54x        |      15.32x
32x24 |     10.02x        |      17.01x
32x32 |     10.97x        |      18.22x
32x64 |      9.82x        |      19.59x

diff -r d05b920865e7 -r b4c2149e9bb1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 18:28:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 18:50:51 2017 +0530
@@ -2234,6 +2234,20 @@
         p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
         p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
         p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
diff -r d05b920865e7 -r b4c2149e9bb1 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Jul 25 18:28:43 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Tue Jul 25 18:50:51 2017 +0530
@@ -377,6 +377,45 @@
     movu       [r2 + r4 + mmsize], m3
 %endmacro
 
+%macro P2S_32x8_AVX512 0                ; 8 rows per expansion: dst = (src << (14 - BIT_DEPTH)) - m4, word-wise
+    movu       m0, [r0]                 ; load rows 0..3 from r0 with row pitch r1; expects r5 = 3 * r1
+    movu       m1, [r0 + r1]
+    movu       m2, [r0 + r1 * 2]
+    movu       m3, [r0 + r5]
+    psllw      m0, (14 - BIT_DEPTH)     ; shift every 16-bit word left by (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4                   ; subtract the per-word constant the caller placed in m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2], m0                 ; store rows 0..3 to r2 with row pitch r3; expects r4 = 3 * r3
+    movu       [r2 + r3], m1
+    movu       [r2 + r3 * 2], m2
+    movu       [r2 + r4], m3
+
+    lea        r0, [r0 + r1 * 4]        ; step both pointers down 4 rows for the second half
+    lea        r2, [r2 + r3 * 4]
+
+    movu       m0, [r0]                 ; rows 4..7: same load / shift / subtract / store sequence
+    movu       m1, [r0 + r1]
+    movu       m2, [r0 + r1 * 2]
+    movu       m3, [r0 + r5]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    movu       [r2], m0
+    movu       [r2 + r3], m1
+    movu       [r2 + r3 * 2], m2
+    movu       [r2 + r4], m3            ; on exit r0/r2 point at row 4 of this group, not row 8
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -475,6 +514,132 @@
     lea        r2, [r2 + r3 * 4]
     P2S_64x8_AVX512
     RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x8, 4, 6, 5    ; 8-row kernel: a single P2S_32x8_AVX512 expansion
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x16, 4, 6, 5   ; 16-row kernel: two P2S_32x8_AVX512 expansions
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    lea        r0, [r0 + r1 * 4]            ; the macro stops at its row 4, so skip 4 more rows to reach row 8
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 8..15
+    RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x24, 4, 6, 5   ; 24-row kernel: three P2S_32x8_AVX512 expansions
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    lea        r0, [r0 + r1 * 4]            ; the macro stops at its row 4, so skip 4 more rows to reach the next group
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 8..15
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 16..23
+    RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x32, 4, 6, 5   ; 32-row kernel: four P2S_32x8_AVX512 expansions
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    lea        r0, [r0 + r1 * 4]            ; the macro stops at its row 4, so skip 4 more rows to reach the next group
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 8..15
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 16..23
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 24..31
+    RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x48, 4, 6, 5   ; 48-row kernel: six P2S_32x8_AVX512 expansions
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    lea        r0, [r0 + r1 * 4]            ; the macro stops at its row 4, so skip 4 more rows to reach the next group
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 8..15
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 16..23
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 24..31
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 32..39
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 40..47
+    RET
+
+INIT_ZMM avx512                             ; presumably selects ZMM for m0..m4 (x86inc convention) - confirm
+cglobal filterPixelToShort_32x64, 4, 6, 5   ; 64-row kernel: eight P2S_32x8_AVX512 expansions
+    add        r1d, r1d                     ; r1 *= 2 (load row pitch); presumably element stride -> bytes for 16-bit samples
+    add        r3d, r3d                     ; r3 *= 2 (store row pitch); NOTE(review): 32-bit add clears bits 63:32 - assumes non-negative strides
+    lea        r4, [r3 * 3]                 ; r4 = 3 * r3, offset of the 4th store row used by the macro
+    lea        r5, [r1 * 3]                 ; r5 = 3 * r1, offset of the 4th load row used by the macro
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]        ; copy the 256 bits at pw_2000 into both halves of m4 (value defined elsewhere)
+    P2S_32x8_AVX512                         ; rows 0..7
+    lea        r0, [r0 + r1 * 4]            ; the macro stops at its row 4, so skip 4 more rows to reach the next group
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 8..15
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 16..23
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 24..31
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 32..39
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 40..47
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 48..55
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512                         ; rows 56..63
+    RET
 ;-----------------------------------------------------------------------------------------------------------------------------
 ;p2s avx512 code end
 ;-----------------------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list