[x265] [PATCH 252 of 307] x86: AVX512 intra_pred_dc32 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:10 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512723573 -19800
# Fri Dec 08 14:29:33 2017 +0530
# Node ID ddd64f4b2ff382d05e86708750b20332ed93f3c9
# Parent fa954ed4a1e7ce2741f3cac14006f78c3199191b
x86: AVX512 intra_pred_dc32 for high bit depth
AVX2 performance : 15.53x
AVX512 performance : 23.96x
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 14:29:33 2017 +0530
@@ -3053,6 +3053,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
}
#endif
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred.h Fri Dec 08 14:29:33 2017 +0530
@@ -76,7 +76,7 @@
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred16.asm Fri Dec 08 14:29:33 2017 +0530
@@ -688,6 +688,68 @@
movu [r0 + r2 * 1 + 0], m0
movu [r0 + r2 * 1 + mmsize], m0
RET
+
+;---------------------------------------------------------------------------------------
+; void intra_pred_dc32(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter)
+;
+; DC intra prediction of a 32x32 block, high bit depth build (16-bit words).
+;   r0 = dst
+;   r1 = dstStride (doubled on entry; presumably pixels -> bytes)
+;   r2 = srcPix, reused as 3 * stride once the references are summed
+;
+; DC = (sum of 64 reference words + 32) >> 6.  The references are the 32 words
+; at srcPix + 2 bytes and the 32 words 2 * mmsize bytes after them (presumably
+; the above row and the left column - confirm against the caller).
+;
+; The final horizontal step uses psrldq + paddd rather than phaddd.  phaddd has
+; no EVEX encoding and so cannot address registers 16-31; avoiding it lets the
+; routine run entirely in m0/m1 whatever physical registers those map to.
+;
+; NOTE(review): each word lane accumulates 8 references before pmaddwd widens
+; it as a signed value, so inputs are assumed to be <= 4095 - confirm.
+;---------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal intra_pred_dc32, 3,3,2
+    add             r2, 2                       ; step past srcPix[0]
+    add             r1d, r1d
+
+    ; add the two 32-word reference runs, then fold 512 -> 256 -> 128 bits
+    movu            m0, [r2]
+    movu            m1, [r2 + 2 * mmsize]
+    paddw           m0, m1
+    vextracti32x8   ym1, m0, 1
+    paddw           ym0, ym1
+    vextracti32x4   xm1, m0, 1
+    paddw           xm0, xm1
+
+    ; widen to dwords and reduce to one total in dword 0
+    pmaddwd         xm0, [pw_1]                 ; pw_1 presumably holds words of 1
+    movhlps         xm1, xm0
+    paddd           xm0, xm1
+    psrldq          xm1, xm0, 4
+    paddd           xm0, xm1
+
+    ; round, divide and replicate the DC value into every word of m0
+    paddd           xm0, [pd_32]                ; sum = sum + 32
+    psrld           xm0, 6                      ; sum = sum / 64
+    vpbroadcastw    m0, xm0
+
+    ; r2 is free now: keep 3 * stride there to reach the fourth row of a group
+    lea             r2, [r1 * 3]
+    ; store DC 32x32: one full-width store per row, 7 groups of 4 rows ...
+%rep 7
+    movu            [r0 + r1 * 0 + 0], m0
+    movu            [r0 + r1 * 1 + 0], m0
+    movu            [r0 + r1 * 2 + 0], m0
+    movu            [r0 + r2 * 1 + 0], m0
+    lea             r0, [r0 + r1 * 4]
+%endrep
+    ; ... and the last 4 rows, with no pointer advance afterwards
+    movu            [r0 + r1 * 0 + 0], m0
+    movu            [r0 + r1 * 1 + 0], m0
+    movu            [r0 + r1 * 2 + 0], m0
+    movu            [r0 + r2 * 1 + 0], m0
+    RET
%endif
;---------------------------------------------------------------------------------------
More information about the x265-devel
mailing list