[x265] [PATCH 281 of 307] x86: AVX512 cpy1Dto2D_shr_16

Sat Apr 7 04:34:39 CEST 2018

# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515407134 -19800
#      Mon Jan 08 15:55:34 2018 +0530
# Node ID c9f8c315a900c488e41bb39955a1c893e35a66d4
# Parent  8d466bd92e8fac3f24526c01a9532cb05ca82fa1
x86: AVX512 cpy1Dto2D_shr_16

AVX2 performance   : 17.79x
AVX512 performance : 25.49x

diff -r 8d466bd92e8f -r c9f8c315a900 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Fri Jan 05 10:32:03 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 08 15:55:34 2018 +0530
@@ -2556,6 +2556,7 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
         p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
@@ -4906,6 +4907,7 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
 
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r 8d466bd92e8f -r c9f8c315a900 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Jan 05 10:32:03 2018 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Jan 08 15:55:34 2018 +0530
@@ -6688,6 +6688,37 @@
     jnz        .loop
     RET
 
+; void cpy1Dto2D_shr_16(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) - 16x16 block, AVX-512
+; dst word = (src word + (1 << (shift - 1))) >> shift (16-bit wrapping); r0 = dst, r1 = src, r2 = dstStride, r3m = shift
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shr_16, 3, 5, 3
+    add                 r2,              r2                      ; stride * 2 = byte stride (16-bit elements), full register width
+    movd                xm0,             r3m                     ; xm0 = shift count
+    pcmpeqw             ym1,             ym1                     ; every word = -1
+    psllw               ym1,             ym1,        xm0         ; every word = -(1 << shift)
+    psraw               ym1,             1                       ; every word = -(1 << (shift - 1)), the negated rounding term
+    vinserti32x8        m1,              ym1,        1           ; replicate the low 256 bits into the high 256 bits
+    mov                 r3d,             4                       ; shift is already in xm0; r3d now counts 4 iterations of 4 rows
+    lea                 r4,              [r2 * 3]                ; r4 = 3 * byte stride (offset of the 4th row)
+.loop:
+    ; Rows 0-1: one 64-byte load holds two contiguous 16-word source rows
+    movu                m2,              [r1]
+    psubw               m2,              m1                      ; subtracting the negated term adds the rounding offset
+    psraw               m2,              xm0                     ; arithmetic shift right by 'shift'
+    movu                [r0],            ym2                     ; low half  -> row 0
+    vextracti32x8       [r0 + r2],       m2,         1           ; high half -> row 1
+    ; Rows 2-3: same processing on the next 64 source bytes
+    movu                m2,              [r1 + mmsize]
+    psubw               m2,              m1
+    psraw               m2,              xm0
+    movu                [r0 + r2 * 2],   ym2                     ; low half  -> row 2
+    vextracti32x8       [r0 + r4],       m2,         1           ; high half -> row 3
+
+    add                 r1,              2 * mmsize              ; source advances by four rows (128 bytes)
+    lea                 r0,              [r0 + r2 * 4]           ; destination advances by four rows
+    dec                 r3d
+    jnz                 .loop
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
diff -r 8d466bd92e8f -r c9f8c315a900 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Fri Jan 05 10:32:03 2018 +0530
+++ b/source/common/x86/blockcopy8.h	Mon Jan 08 15:55:34 2018 +0530
@@ -42,6 +42,7 @@
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); // NOTE(review): the asm prototype comment names this parameter dstStride - confirm which name is intended
 
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);