[x265] [PATCH 073 of 307] x86: AVX512 cpy1Dto2D_shl_32

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:11 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502186111 -19800
#      Tue Aug 08 15:25:11 2017 +0530
# Node ID 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d
# Parent  ce93c1b1894ae7d789e451f65479f018ba90ec76
x86: AVX512 cpy1Dto2D_shl_32

Size | BitDepth | AVX2 performance | AVX512 performance
-------------------------------------------------------
32x32|    8     |     16.03x       |       28.94x
32x32|    10    |     14.12x       |       24.99x

diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 08 15:25:11 2017 +0530
@@ -2311,6 +2311,8 @@
 
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -3992,6 +3994,7 @@
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
     }
 #endif
diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Aug 08 15:25:11 2017 +0530
@@ -5513,7 +5513,62 @@
     jnz        .loop
     RET
 
-
+;--------------------------------------------------------------------------------------
+; cpy_1Dto2D_shl avx512 code start
+;--------------------------------------------------------------------------------------
+%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0 ; shift-copy 8 rows: r1 = packed src, r0 = dst, r2 = dst row step, r3 = 3 * r2, xm0 = shift count; clobbers m1-m4
+    movu        m1,            [r1 + 0 * mmsize]    ; rows 0-3: one full vector register per row, read back-to-back from src
+    movu        m2,            [r1 + 1 * mmsize]
+    movu        m3,            [r1 + 2 * mmsize]
+    movu        m4,            [r1 + 3 * mmsize]
+    psllw       m1,            xm0                  ; left-shift every 16-bit word by the count held in xm0
+    psllw       m2,            xm0
+    psllw       m3,            xm0
+    psllw       m4,            xm0
+    movu        [r0],          m1                   ; dst rows are r2 bytes apart (src rows are contiguous)
+    movu        [r0 + r2],     m2
+    movu        [r0 + 2 * r2], m3
+    movu        [r0 + r3],     m4                   ; relies on the caller having set r3 = 3 * r2
+
+    add         r1,            4 * mmsize           ; step src past the four rows just consumed
+    lea         r0,            [r0 + r2 * 4]        ; step dst down four rows (lea leaves flags untouched)
+
+    movu        m1,            [r1 + 0 * mmsize]    ; rows 4-7: same load / shift / store pattern
+    movu        m2,            [r1 + 1 * mmsize]
+    movu        m3,            [r1 + 2 * mmsize]
+    movu        m4,            [r1 + 3 * mmsize]
+    psllw       m1,            xm0
+    psllw       m2,            xm0
+    psllw       m3,            xm0
+    psllw       m4,            xm0
+    movu        [r0],          m1
+    movu        [r0 + r2],     m2
+    movu        [r0 + 2 * r2], m3
+    movu        [r0 + r3],     m4                   ; on exit r1/r0 still point at row 4; caller must advance 4 more rows before reuse
+%endmacro
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shl_32, 4, 4, 5
+    add         r2d, r2d                 ; double the stride, presumably int16_t elements -> bytes; NOTE(review): 32-bit add zero-extends, so a negative stride would break - confirm callers
+    movd        xm0, r3d                 ; move the shift argument into xm0, the count operand of psllw in the macro
+    lea         r3, [3 * r2]             ; shift is saved in xm0, so r3 is reused as 3 * stride for the 4th row of each group
+
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+    add         r1, 4 * mmsize           ; macro advanced only past its first 4 rows; skip the remaining 4 (src)
+    lea         r0, [r0 + r2 * 4]        ; ... and likewise for dst
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+    add         r1, 4 * mmsize           ; rows 8-15 done, move to row 16
+    lea         r0, [r0 + r2 * 4]
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+    add         r1, 4 * mmsize           ; rows 16-23 done, move to row 24
+    lea         r0, [r0 + r2 * 4]
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+    RET
+;--------------------------------------------------------------------------------------
+; cpy_1Dto2D_shl avx512 code end
+;--------------------------------------------------------------------------------------
 ;--------------------------------------------------------------------------------------
 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Aug 08 15:25:11 2017 +0530
@@ -37,6 +37,7 @@
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);


More information about the x265-devel mailing list