[x265] [PATCH 294 of 307] x86: AVX512 planecopy_sp_shl for input 10bit, output 12bit

Sat Apr 7 04:34:52 CEST 2018

# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515999160 -19800
#      Mon Jan 15 12:22:40 2018 +0530
# Node ID 1107c2def5f9dbee9947a2c9c41f50961fa31bc6
# Parent  3a310b157fdf345023ff4e96e7de316cee79b954
x86: AVX512 planecopy_sp_shl for input 10bit, output 12bit

AVX2 performance   : 16.49x
AVX512 performance : 20.44x

diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 15 12:22:40 2018 +0530
@@ -3149,6 +3149,7 @@
         p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
+        p.planecopy_sp_shl = PFX(upShift_16_avx512);
 
     }
 #endif
@@ -5362,6 +5363,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.planecopy_sp_shl = PFX(upShift_16_avx512);
 
     }
 #endif
diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Jan 15 12:22:40 2018 +0530
@@ -8763,8 +8763,53 @@
 
 .end:
     RET
-
-
+INIT_ZMM avx512
+cglobal upShift_16, 4,7,4       ; C args: src, srcStride, dst, dstStride, width, height, shift, mask (mask is never read here)
+    mov         r4d, r4m        ; r4d = width, counted in 16-bit words
+    mov         r5d, r5m        ; r5d = height
+    movd        xm0, r6m        ; m0 = shift
+    vbroadcasti32x4 m3, [pw_pixel_max] ; m3 = AND mask applied to every shifted word
+    FIX_STRIDES r1d, r3d        ; macro defined elsewhere; presumably converts both strides to bytes - confirm
+    dec         r5d             ; r5d = rows for the full-vector loop (every row except the last)
+    jz         .loop32          ; height == 1: only the last row exists; entering .loopH would wrap r5d below zero
+.loopH:
+    xor         r6d, r6d        ; r6 = column index, restarted for each row
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    psllw       m1, xm0         ; shift each word left by the count held in xm0
+    pand        m1, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+
+    add         r6, mmsize / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl         .loopW           ; whole vectors only: the final vector of a row may extend past width
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz        .loopH
+
+    ; last row is processed backwards from its end so a width that is not a multiple of 32 never runs past the row; NOTE(review): assumes width >= 32 - confirm
+.loop32:
+    movu        m1, [r0 + (r4 - mmsize/2) * 2] ; the last mmsize/2 unprocessed words of the row
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2 + (r4 - mmsize/2) * 2], m1
+
+    sub         r4d, mmsize/2   ; r4d = words still unprocessed at the start of the row
+    jz         .end             ; nothing left: width was a multiple of mmsize/2
+    cmp         r4d, mmsize/2
+    jge        .loop32          ; at least one more whole vector fits
+
+    ; process the remaining head of the row with one vector from the row start (overlaps words already written)
+    movu        m1, [r0]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2], m1
+
+.end:
+    RET
 ;---------------------------------------------------------------------------------------------------------------------
 ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
 ;---------------------------------------------------------------------------------------------------------------------
diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/pixel.h	Mon Jan 15 12:22:40 2018 +0530
@@ -34,6 +34,7 @@
 void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);