[x265] [PATCH 262 of 307] x86: Aligned routine implementation for cpy1Dto2D_shl primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:20 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1513143310 -19800
# Wed Dec 13 11:05:10 2017 +0530
# Node ID 458b708e6d17aafb49a5fd369b2e9540d0268726
# Parent ab5b1becd807647d5264381c1fb74750c20fdfae
x86: Aligned routine implementation for cpy1Dto2D_shl primitive
1. cpy1Dto2D_shl optimization
2. Aligned code implementation
3. Linking with encoder
diff -r ab5b1becd807 -r 458b708e6d17 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/pixel.cpp Wed Dec 13 11:05:10 2017 +0530
@@ -1004,7 +1004,8 @@
p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_c<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
- p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
diff -r ab5b1becd807 -r 458b708e6d17 source/common/primitives.h
--- a/source/common/primitives.h Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/primitives.h Wed Dec 13 11:05:10 2017 +0530
@@ -280,9 +280,8 @@
count_nonzero_t count_nonzero;
cpy2Dto1D_shl_t cpy2Dto1D_shl;
cpy2Dto1D_shr_t cpy2Dto1D_shr;
- cpy1Dto2D_shl_t cpy1Dto2D_shl;
+ cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES];
cpy1Dto2D_shr_t cpy1Dto2D_shr;
-
copy_sp_t copy_sp;
copy_ps_t copy_ps;
copy_ss_t copy_ss;
diff -r ab5b1becd807 -r 458b708e6d17 source/common/quant.cpp
--- a/source/common/quant.cpp Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/quant.cpp Wed Dec 13 11:05:10 2017 +0530
@@ -560,13 +560,11 @@
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
{
const uint32_t sizeIdx = log2TrSize - 2;
-
if (cu.m_tqBypass[0])
{
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
return;
}
-
// Values need to pass as input parameter in dequant
int rem = m_qpParam[ttype].rem;
int per = m_qpParam[ttype].per;
@@ -595,7 +593,7 @@
if (transformShift > 0)
primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
else
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
#endif
}
else
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 13 11:05:10 2017 +0530
@@ -989,7 +989,7 @@
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
#if X86_64
@@ -1692,11 +1692,9 @@
ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2);
-
ALL_LUMA_TU(count_nonzero, count_nonzero, avx2);
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
-
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
@@ -2526,10 +2524,10 @@
p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
-
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
p.weight_pp = PFX(weight_pp_avx512);
p.weight_sp = PFX(weight_sp_avx512);
p.dequant_normal = PFX(dequant_normal_avx512);
@@ -3196,10 +3194,9 @@
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
-
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
@@ -3794,12 +3791,9 @@
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
-
ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
-
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
-
p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
@@ -4861,7 +4855,8 @@
p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Dec 13 11:05:10 2017 +0530
@@ -5599,17 +5599,58 @@
add r2d, r2d
movd xm0, r3d
lea r3, [3 * r2]
-
+%rep 3
PROCESS_CPY1Dto2D_SHL_32x8_AVX512
add r1, 4 * mmsize
lea r0, [r0 + r2 * 4]
+%endrep
PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+ RET
+
+%macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
+ psllw m1, xm0
+ psllw m2, xm0
+ psllw m3, xm0
+ psllw m4, xm0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + 2 * r2], m3
+ mova [r0 + r3], m4
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
+ psllw m1, xm0
+ psllw m2, xm0
+ psllw m3, xm0
+ psllw m4, xm0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + 2 * r2], m3
+ mova [r0 + r3], m4
+%endmacro
+;-----------------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl_aligned(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;-----------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5
+ add r2d, r2d
+ movd xm0, r3d
+ lea r3, [3 * r2]
+%rep 3
+ PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
add r1, 4 * mmsize
lea r0, [r0 + r2 * 4]
- PROCESS_CPY1Dto2D_SHL_32x8_AVX512
- add r1, 4 * mmsize
- lea r0, [r0 + r2 * 4]
- PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+%endrep
+ PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
RET
;--------------------------------------------------------------------------------------
; copy_cnt avx512 code end
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/blockcopy8.h Wed Dec 13 11:05:10 2017 +0530
@@ -38,7 +38,7 @@
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-
+FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Dec 08 14:00:59 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Dec 13 11:05:10 2017 +0530
@@ -469,12 +469,10 @@
return true;
}
-
bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
{
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
-
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -497,6 +495,33 @@
return true;
}
+bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
+{
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int shift = (rand() % 7 + 1);
+
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+ return false;
+
+ reportfail();
+ j += INCR + 32;
+ }
+
+ return true;
+}
bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
{
@@ -2597,15 +2622,22 @@
return false;
}
}
-
- if (opt.cu[i].cpy1Dto2D_shl)
+ if (opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
{
- if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl))
+ if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[NONALIGNED], opt.cu[i].cpy1Dto2D_shl[NONALIGNED]))
{
printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
}
}
+ if (opt.cu[i].cpy1Dto2D_shl[ALIGNED])
+ {
+ if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
+ {
+ printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
if (opt.cu[i].cpy1Dto2D_shr)
{
@@ -3270,13 +3302,17 @@
HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3);
}
-
- if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl)
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
{
HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64);
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[NONALIGNED], ref.cu[i].cpy1Dto2D_shl[NONALIGNED], sbuf1, sbuf2, STRIDE, 64);
}
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[ALIGNED])
+ {
+ HEADER("cpy1Dto2D_shl_aligned[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[ALIGNED], ref.cu[i].cpy1Dto2D_shl[ALIGNED], sbuf1, sbuf2, STRIDE, 64);
+ }
if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr)
{
HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.h
--- a/source/test/pixelharness.h Fri Dec 08 14:00:59 2017 +0530
+++ b/source/test/pixelharness.h Wed Dec 13 11:05:10 2017 +0530
@@ -97,6 +97,7 @@
bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+ bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
bool check_pixel_var(var_t ref, var_t opt);
More information about the x265-devel
mailing list