[x265] [PATCH 038 of 307] x86: AVX512 getResidual32

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:36 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1500627732 -19800
#      Fri Jul 21 14:32:12 2017 +0530
# Node ID 49123506b563fd44378e856e6833c77812d0349e
# Parent  ef8989f43083cd5195ff3ba360959fe3900399e5
x86: AVX512 getResidual32

BIT_DEPTH = 8
AVX2 performance over C code   : 2.99x
AVX512 performance over C code : 5.46x

HIGH_BIT_DEPTH
AVX2 performance over C code   : 3.10x
AVX512 performance over C code : 5.60x

diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jul 21 14:32:12 2017 +0530
@@ -3859,6 +3859,9 @@
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
 
+        // getResidual32_avx512 uses ZMM registers: register it only under the X265_CPU_AVX512 check
+        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+
     }
 #endif
 }
diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Jul 21 14:32:12 2017 +0530
@@ -554,6 +554,135 @@
 %endrep
     RET
 %endif
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0       ; 4 rows: [r2] = [r0] - [r1] in 16-bit lanes, then advance r0/r1/r2 by 4 rows
+    movu        m0, [r0]                            ; minuend rows 0..3; r3 is used as the row pitch in bytes
+    movu        m1, [r0 + r3]
+    movu        m2, [r0 + r3 * 2]
+    movu        m3, [r0 + r4]                       ; expects r4 = 3 * r3 (set up by the expanding function)
+    lea         r0, [r0 + r3 * 4]                   ; r0 += 4 rows
+
+    movu        m4, [r1]                            ; subtrahend rows 0..3, same offsets as above
+    movu        m5, [r1 + r3]
+    movu        m6, [r1 + r3 * 2]
+    movu        m7, [r1 + r4]
+    lea         r1, [r1 + r3 * 4]                   ; r1 += 4 rows
+
+    psubw       m0, m4                              ; per-word difference (wrapping, no saturation)
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0                            ; store the 4 result rows with the same pitch as the inputs
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3
+    lea         r2, [r2 + r3 * 4]                   ; r2 += 4 rows, ready for the next group
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0   ; final 4-row group: same work as PROCESS_GETRESIDUAL32_W4_HBD_AVX512 but r0/r1/r2 are not advanced
+    movu        m0, [r0]                            ; minuend rows 0..3; r3 is used as the row pitch in bytes
+    movu        m1, [r0 + r3]
+    movu        m2, [r0 + r3 * 2]
+    movu        m3, [r0 + r4]                       ; expects r4 = 3 * r3 (set up by the expanding function)
+
+    movu        m4, [r1]                            ; subtrahend rows 0..3, same offsets as above
+    movu        m5, [r1 + r3]
+    movu        m6, [r1 + r3 * 2]
+    movu        m7, [r1 + r4]
+
+    psubw       m0, m4                              ; per-word difference (wrapping, no saturation)
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0                            ; store the 4 result rows with the same pitch as the inputs
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0           ; 4 rows: bytes at r0/r1 widened to words, [r2] = [r0] - [r1], then advance r0/r1/r2
+    pmovzxbw    m0, [r0]                            ; zero-extend each byte of minuend row 0 to a 16-bit word
+    pmovzxbw    m1, [r0 + r3]                       ; r3 is used as the input row pitch in bytes
+    pmovzxbw    m2, [r0 + r3 * 2]
+    pmovzxbw    m3, [r0 + r4]                       ; expects r4 = 3 * r3 (set up by the expanding function)
+    lea         r0, [r0 + r3 * 4]                   ; r0 += 4 input rows
+
+    pmovzxbw    m4, [r1]                            ; subtrahend rows 0..3, widened the same way
+    pmovzxbw    m5, [r1 + r3]
+    pmovzxbw    m6, [r1 + r3 * 2]
+    pmovzxbw    m7, [r1 + r4]
+    lea         r1, [r1 + r3 * 4]                   ; r1 += 4 input rows
+
+    psubw       m0, m4                              ; per-word difference of the widened values
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0                            ; output rows are 2 * r3 bytes apart: word results vs. byte inputs
+    movu        [r2 + r3 * 2], m1
+    lea         r2, [r2 + r3 * 4]                   ; r2 += 2 output rows
+    movu        [r2], m2
+    movu        [r2 + r3 * 2], m3
+    lea         r2, [r2 + r3 * 4]                   ; r2 += 2 more output rows, ready for the next group
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0       ; final 4-row group: same work as PROCESS_GETRESIDUAL32_W4_AVX512 without the trailing pointer advances
+    pmovzxbw    m0, [r0]                            ; zero-extend each byte of minuend row 0 to a 16-bit word
+    pmovzxbw    m1, [r0 + r3]                       ; r3 is used as the input row pitch in bytes
+    pmovzxbw    m2, [r0 + r3 * 2]
+    pmovzxbw    m3, [r0 + r4]                       ; expects r4 = 3 * r3 (set up by the expanding function)
+
+    pmovzxbw    m4, [r1]                            ; subtrahend rows 0..3, widened the same way
+    pmovzxbw    m5, [r1 + r3]
+    pmovzxbw    m6, [r1 + r3 * 2]
+    pmovzxbw    m7, [r1 + r4]
+
+    psubw       m0, m4                              ; per-word difference of the widened values
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0                            ; output rows are 2 * r3 bytes apart: word results vs. byte inputs
+    movu        [r2 + r3 * 2], m1
+    lea         r2, [r2 + r3 * 4]                   ; r2 += 2 output rows (only advance done in this macro)
+    movu        [r2], m2
+    movu        [r2 + r3 * 2], m3
+%endmacro
+
+
+%if HIGH_BIT_DEPTH                                  ; variant that reads 16-bit samples directly (plain movu loads)
+INIT_ZMM avx512                                     ; x86inc: mN = zmmN, symbol gets the _avx512 suffix
+cglobal getResidual32, 4,5,8                        ; x86inc: 4 args in r0-r3, 5 GPRs, 8 vector regs; NOTE(review): presumably r0 = fenc, r1 = pred, r2 = residual, r3 = stride - confirm against the C prototype
+    add         r3, r3                              ; r3 *= 2; presumably converts the stride from 16-bit elements to bytes - confirm
+    lea         r4, [r3 * 3]                        ; r4 = 3 * r3, offset of the fourth row in each 4-row group
+
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; 7 advancing groups + 1 final group = 8 * 4 = 32 rows
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END         ; last group skips the pointer advances
+    RET
+%else                                               ; variant that reads 8-bit samples and widens them with pmovzxbw
+INIT_ZMM avx512                                     ; x86inc: mN = zmmN, symbol gets the _avx512 suffix
+cglobal getResidual32, 4,5,8                        ; x86inc: 4 args in r0-r3, 5 GPRs, 8 vector regs; NOTE(review): presumably r0 = fenc, r1 = pred, r2 = residual, r3 = stride - confirm against the C prototype
+    lea         r4, [r3 * 3]                        ; r4 = 3 * r3; r3 is used unscaled as the input pitch in bytes
+
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; 7 advancing groups + 1 final group = 8 * 4 = 32 rows
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512_END             ; last group skips the trailing pointer advances
+    RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list