[x265] [PATCH 096 of 307] x86: AVX512 copy_cnt_32 and copy_cnt_16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:34 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503557407 -19800
# Thu Aug 24 12:20:07 2017 +0530
# Node ID 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47
# Parent 31a180bcef33fae436ad7e3aa4378b283a86d56a
x86: AVX512 copy_cnt_32 and copy_cnt_16
Size | BitDepth | AVX2 performance | AVX512 performance
-------------------------------------------------------
16x16| 8 | 6.92x | 8.07x
16x16| 10 | 6.72x | 7.75x
32x32| 8 | 6.08x | 10.33x
32x32| 10 | 6.04x | 10.16x
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Aug 24 12:20:07 2017 +0530
@@ -2342,6 +2342,9 @@
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
p.weight_pp = PFX(weight_pp_avx512);
+ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+ p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -4054,6 +4057,9 @@
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+ p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
//i444 chroma_hpp
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Aug 24 12:20:07 2017 +0530
@@ -5958,7 +5958,91 @@
movd eax, xm4
RET
-
+;--------------------------------------------------------------------------------------
+; copy_cnt avx512 code start
+;--------------------------------------------------------------------------------------
+%macro PROCESS_COPY_CNT_32x4_AVX512 0 ; copy 4 rows of 32 int16 from src (r1) to dst (r0) and add per-lane non-zero flags into m4; expects m3 = byte clamp constant, r2 = row pitch in bytes, r3 = 3 * r2; clobbers m0-m2
+ movu m0, [r1] ; row 0 (one full vector register per row)
+ movu m1, [r1 + r2] ; row 1
+ movu [r0], m0 ; dst rows are written back-to-back, mmsize bytes apart
+ movu [r0 + mmsize], m1
+ packsswb m0, m1 ; signed-saturating word->byte pack: a non-zero word stays a non-zero byte
+ pminub m0, m3 ; unsigned min against m3 clamps each byte to 0 or 1 (given m3 holds 1s)
+
+ movu m1, [r1 + 2 * r2] ; row 2
+ movu m2, [r1 + r3] ; row 3 (r3 = 3 * r2)
+ movu [r0 + 2 * mmsize], m1
+ movu [r0 + 3 * mmsize], m2
+ packsswb m1, m2 ; same non-zero flagging for rows 2 and 3
+ pminub m1, m3
+
+ paddb m0, m1 ; each byte lane is now 0..2
+ paddb m4, m0 ; accumulate into the running per-lane byte counters
+%endmacro
+
+%macro PROCESS_COPY_CNT_16x4_AVX512 0 ; copy 4 rows of 16 int16 from src (r1) to dst (r0) and add per-lane non-zero flags into m4; expects m3 = byte clamp constant, r2 = row pitch in bytes, r3 = 3 * r2; clobbers m0-m1
+ movu ym0, [r1] ; row 0 -> lower 256 bits of m0
+ vinserti32x8 m0, [r1 + r2], 1 ; row 1 -> upper 256 bits of m0
+ movu ym1, [r1 + 2 * r2] ; row 2 -> lower 256 bits of m1
+ vinserti32x8 m1, [r1 + r3], 1 ; row 3 -> upper 256 bits of m1 (r3 = 3 * r2)
+ movu [r0], m0 ; rows 0-1 stored contiguously in dst
+ movu [r0 + mmsize], m1 ; rows 2-3 follow directly
+ packsswb m0, m1 ; signed-saturating word->byte pack: a non-zero word stays a non-zero byte
+ pminub m0, m3 ; unsigned min against m3 clamps each byte to 0 or 1 (given m3 holds 1s)
+ paddb m4, m0 ; accumulate into the running per-lane byte counters
+%endmacro
+
+%macro PROCESS_COPY_CNT_END_AVX512 0 ; horizontally sum the 64 byte counters in m4 and leave the total in eax; clobbers m0, m1, m4
+ pxor m0, m0 ; zero vector, used as the psadbw reference below
+ vextracti32x8 ym1, m4, 1 ; ym1 = upper 256 bits of the counters
+ paddb ym4, ym1 ; fold 64 byte lanes down to 32
+ vextracti32x4 xm1, ym4, 1 ; xm1 = upper 128 bits of ym4
+ paddb xm4, xm1 ; fold 32 byte lanes down to 16 (callers in this file add at most 16 per lane, so a lane is <= 64 here and cannot wrap)
+ psadbw xm4, xm0 ; sum of absolute differences vs. zero: each 64-bit half now holds the sum of its 8 bytes
+ movhlps xm1, xm4 ; bring the high-half sum down into the low qword of xm1
+ paddd xm4, xm1 ; low dword = sum of both halves = total count
+ movd eax, xm4 ; place the total in eax
+%endmacro
+
+;--------------------------------------------------------------------------------------
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
+;--------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal copy_cnt_32, 3, 4, 5 ; 32x32 block: copies src rows into a densely packed dst and returns the number of non-zero coefficients; r0 = dst, r1 = src, r2 = srcStride
+ add r2d, r2d ; stride in int16 elements -> row pitch in bytes. NOTE(review): this is a 32-bit add, which zero-extends into r2 on x86-64, so a negative srcStride would not survive - confirm callers never pass one
+ lea r3, [3 * r2] ; r3 = 3 * pitch, lets the macro address row 3 directly
+
+ vbroadcasti32x8 m3, [pb_1] ; replicate the 32 bytes at pb_1 into both halves of m3; pb_1 is defined outside this view - presumably 0x01 bytes, TODO confirm it is at least 32 bytes long
+ pxor m4, m4 ; clear the per-lane byte counters
+
+%rep 7 ; rows 0..27, four rows per iteration
+ PROCESS_COPY_CNT_32x4_AVX512
+ add r0, 4 * mmsize ; dst += 4 rows (rows are mmsize bytes each, no padding)
+ lea r1, [r1 + 4 * r2] ; src += 4 rows
+%endrep
+ PROCESS_COPY_CNT_32x4_AVX512 ; rows 28..31; no pointer advance needed after the last group
+ PROCESS_COPY_CNT_END_AVX512 ; reduce the byte counters to a single total in eax
+ RET
+
+INIT_ZMM avx512
+cglobal copy_cnt_16, 3, 4, 5 ; 16x16 block: copies src rows into a densely packed dst and returns the number of non-zero coefficients; r0 = dst, r1 = src, r2 = srcStride
+ add r2d, r2d ; stride in int16 elements -> row pitch in bytes. NOTE(review): this is a 32-bit add, which zero-extends into r2 on x86-64, so a negative srcStride would not survive - confirm callers never pass one
+ lea r3, [3 * r2] ; r3 = 3 * pitch, lets the macro address row 3 directly
+
+ vbroadcasti32x8 m3, [pb_1] ; replicate the 32 bytes at pb_1 into both halves of m3; pb_1 is defined outside this view - presumably 0x01 bytes, TODO confirm it is at least 32 bytes long
+ pxor m4, m4 ; clear the per-lane byte counters
+
+%rep 3 ; rows 0..11, four rows per iteration
+ PROCESS_COPY_CNT_16x4_AVX512
+ add r0, 2 * mmsize ; dst += 4 rows (two 16-coefficient rows per mmsize bytes)
+ lea r1, [r1 + 4 * r2] ; src += 4 rows
+%endrep
+ PROCESS_COPY_CNT_16x4_AVX512 ; rows 12..15; no pointer advance needed after the last group
+ PROCESS_COPY_CNT_END_AVX512 ; reduce the byte counters to a single total in eax
+ RET
+;--------------------------------------------------------------------------------------
+; copy_cnt avx512 code end
+;--------------------------------------------------------------------------------------
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/blockcopy8.h Thu Aug 24 12:20:07 2017 +0530
@@ -46,6 +46,7 @@
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
More information about the x265-devel
mailing list