[x265] [PATCH 294 of 307] x86: AVX512 planecopy_sp_shl for input 10bit, output 12bit
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:52 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515999160 -19800
# Mon Jan 15 12:22:40 2018 +0530
# Node ID 1107c2def5f9dbee9947a2c9c41f50961fa31bc6
# Parent 3a310b157fdf345023ff4e96e7de316cee79b954
x86: AVX512 planecopy_sp_shl for input 10bit, output 12bit
AVX2 performance : 16.49x
AVX512 performance : 20.44x
diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 15 12:22:40 2018 +0530
@@ -3149,6 +3149,7 @@
p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
+ p.planecopy_sp_shl = PFX(upShift_16_avx512);
}
#endif
@@ -5362,6 +5363,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.planecopy_sp_shl = PFX(upShift_16_avx512);
}
#endif
diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/pixel-a.asm Mon Jan 15 12:22:40 2018 +0530
@@ -8763,8 +8763,53 @@
.end:
RET
-
-
+INIT_ZMM avx512
+cglobal upShift_16, 4,7,4 ; copy a plane of 16-bit samples from src (r0) to dst (r2), left-shifting and masking each sample
+ mov r4d, r4m ; r4d = width
+ mov r5d, r5m ; r5d = height
+ movd xm0, r6m ; xm0 = shift count
+ vbroadcasti32x4 m3, [pw_pixel_max] ; m3 = AND mask from pw_pixel_max; NOTE(review): the 'mask' argument is never read here - confirm intended
+ FIX_STRIDES r1d, r3d ; presumably converts both strides from pixels to bytes - macro is defined elsewhere, TODO confirm
+ dec r5d ; .loopH covers height - 1 rows; the last row is handled separately below
+.loopH:
+ xor r6d, r6d ; r6 = column offset within the current row, restart at 0
+.loopW:
+ movu m1, [r0 + r6 * SIZEOF_PIXEL] ; load one full vector of source samples
+ psllw m1, xm0 ; shift every 16-bit word left by the shift count
+ pand m1, m3 ; apply the mask held in m3
+ movu [r2 + r6 * SIZEOF_PIXEL], m1 ; store the vector to the destination row
+
+ add r6, mmsize / SIZEOF_PIXEL ; advance by one vector's worth of pixels
+ cmp r6d, r4d ; only whole vectors are processed, so the final one may extend past width
+ jl .loopW ; signed compare: repeat while column offset < width
+
+ ; move to the next row of both source and destination
+ add r0, r1
+ add r2, r3
+ dec r5d ; NOTE(review): with height == 1, r5d is already 0 here and wraps to 0xFFFFFFFF - presumably callers pass height >= 2, confirm
+ jnz .loopH
+
+ ; process the last row of every frame separately [to handle a width which is not a multiple of 32]
+
+.loop32:
+ movu m1, [r0 + (r4 - mmsize/2) * 2] ; load the mmsize/2 words ending at column r4; NOTE(review): starts before r0 when width < mmsize/2 - confirm callers avoid this
+ psllw m1, xm0
+ pand m1, m3
+ movu [r2 + (r4 - mmsize/2) * 2], m1 ; store to the same columns of dst, so nothing past column r4 is written
+
+ sub r4d, mmsize/2 ; move the right edge one vector to the left
+ jz .end ; nothing left in the row
+ cmp r4d, mmsize/2
+ jge .loop32 ; at least one more whole vector fits to the left
+
+ ; process the remaining partial pixels with one vector from the row start (overlaps columns already written above)
+ movu m1, [r0]
+ psllw m1, xm0
+ pand m1, m3
+ movu [r2], m1
+
+.end:
+ RET
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Jan 15 10:36:54 2018 +0530
+++ b/source/common/x86/pixel.h Mon Jan 15 12:22:40 2018 +0530
@@ -34,6 +34,7 @@
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
More information about the x265-devel
mailing list