[x265] [PATCH 283 of 307] x86: AVX512 cpy2Dto1D_shr_32

Sat Apr 7 04:34:41 CEST 2018

# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515408269 -19800
#      Mon Jan 08 16:14:29 2018 +0530
# Node ID 6d1c8baabf78dba44a500f2039eace3e31abf69c
# Parent  48917be3e409f917468ff2f73302b62afef492fb
x86: AVX512 cpy2Dto1D_shr_32

AVX2 performance   : 21.17x
AVX512 performance : 34.33x

diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 08 16:14:29 2018 +0530
@@ -2559,6 +2559,8 @@
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
 
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
+
         p.weight_pp = PFX(weight_pp_avx512);
         p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
@@ -4912,6 +4914,8 @@
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
 
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
+
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
 
diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Jan 08 16:14:29 2018 +0530
@@ -5302,6 +5302,48 @@
     jnz        .loop
     RET
 
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift), 32x32 block
+;   dst[32 * y + x] = (src[srcStride * y + x] + round) >> shift, round = 1 << (shift - 1), shift >= 1
+;   r0 = dst, r1 = src, r2 = srcStride (elements on entry, bytes after doubling), r3d = shift
+;   m0 = shift count, m1 = -round in every word, m2 = current row, r4d = loop counter
+INIT_ZMM avx512
+cglobal cpy2Dto1D_shr_32, 4, 5, 4
+    add                 r2,              r2                      ; stride in bytes; full width, no 32-bit truncation of intptr_t
+    movd                xm0,             r3d                     ; shift count for psllw/psraw
+    pcmpeqw             xmm1,            xmm1                    ; all ones; hard-coded low reg (EVEX compare targets a mask), 128-bit keeps upper state clean
+    psllw               xm1,             xmm1,       xm0         ; -(1 << shift)
+    psraw               xm1,             1                       ; -(1 << (shift - 1)) = -round
+    vpbroadcastw        m1,              xm1                     ; replicate -round into all 32 words
+    lea                 r3,              [r2 * 3]                ; r3 = 3 * stride (shift already copied to xm0)
+    mov                 r4d,             8                       ; 8 iterations x 4 rows = 32 rows
+
+.loop:
+    movu                m2,              [r1]                    ; row 0: 32 coefficients
+    psubw               m2,              m1                      ; src - (-round) = src + round
+    psraw               m2,              xm0                     ; arithmetic shift right by 'shift'
+    movu                [r0],            m2                      ; dst rows are packed, mmsize bytes apart
+
+    movu                m2,              [r1 + r2]               ; row 1
+    psubw               m2,              m1
+    psraw               m2,              xm0
+    movu                [r0 + mmsize],   m2
+
+    movu                m2,              [r1 + 2 * r2]           ; row 2
+    psubw               m2,              m1
+    psraw               m2,              xm0
+    movu                [r0 + 2 * mmsize], m2
+
+    movu                m2,              [r1 + r3]               ; row 3
+    psubw               m2,              m1
+    psraw               m2,              xm0
+    movu                [r0 + 3 * mmsize], m2
+
+    add                 r0,              4 * mmsize              ; advance dst by 4 packed rows
+    lea                 r1,              [r1 + 4 * r2]           ; advance src by 4 strided rows
+    dec                 r4d
+    jnz                 .loop
+    RET
+
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/blockcopy8.h	Mon Jan 08 16:14:29 2018 +0530
@@ -33,6 +33,7 @@
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);