[x265] [PATCH 116 of 307] x86: Aligned routine implementation for calcresidual primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:54 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507115013 -19800
# Wed Oct 04 16:33:33 2017 +0530
# Node ID c497cbf5c2d53ea9c47f3929eaacbb36e703bdfa
# Parent d4ee703039c6cde39312a596cee019c346a8381b
x86: Aligned routine implementation for calcresidual primitive
diff -r d4ee703039c6 -r c497cbf5c2d5 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Oct 05 11:16:10 2017 +0530
+++ b/source/common/pixel.cpp Wed Oct 04 16:33:33 2017 +0530
@@ -1010,6 +1010,7 @@
p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
p.cu[BLOCK_ ## W ## x ## H].calcresidual = getResidual<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual_aligned = getResidual<W>; \
p.cu[BLOCK_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
p.cu[BLOCK_ ## W ## x ## H].sse_ss = sse<W, H, int16_t, int16_t>;
diff -r d4ee703039c6 -r c497cbf5c2d5 source/common/primitives.h
--- a/source/common/primitives.h Thu Oct 05 11:16:10 2017 +0530
+++ b/source/common/primitives.h Wed Oct 04 16:33:33 2017 +0530
@@ -268,6 +268,7 @@
dct_t lowpass_dct; // lowpass dct approximation
calcresidual_t calcresidual;
+ calcresidual_t calcresidual_aligned;
pixel_sub_ps_t sub_ps;
pixel_add_ps_t add_ps;
blockfill_s_t blockfill_s; // block fill, for DC transforms
diff -r d4ee703039c6 -r c497cbf5c2d5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 05 11:16:10 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 16:33:33 2017 +0530
@@ -4450,6 +4450,8 @@
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+ p.cu[BLOCK_16x16].calcresidual_aligned = PFX(getResidual16_avx2);
+ p.cu[BLOCK_32x32].calcresidual_aligned = PFX(getResidual_aligned32_avx512);
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
diff -r d4ee703039c6 -r c497cbf5c2d5 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Oct 05 11:16:10 2017 +0530
+++ b/source/common/x86/pixel-util.h Wed Oct 04 16:33:33 2017 +0530
@@ -27,6 +27,7 @@
#define DEFINE_UTILS(cpu) \
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
+ FUNCDEF_TU_S2(void, getResidual_aligned, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
diff -r d4ee703039c6 -r c497cbf5c2d5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Oct 05 11:16:10 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Oct 04 16:33:33 2017 +0530
@@ -687,6 +687,133 @@
RET
%endif
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512 0 ; 16-bit sample path: [r2] = [r0] - [r1] for 4 rows, then step r0/r1/r2 down 4 rows
+ movu m0, [r0] ; rows 0..3 from r0 (presumably fenc, 1st argument -- confirm against x86inc arg mapping)
+ movu m1, [r0 + r3] ; r3 is the row pitch; the HBD entry point doubles it before calling this macro
+ movu m2, [r0 + r3 * 2]
+ movu m3, [r0 + r4] ; r4 is expected to hold 3 * r3 (set up by the entry point)
+ lea r0, [r0 + r3 * 4] ; advance first source by 4 rows
+
+ movu m4, [r1] ; rows 0..3 from r1 (presumably pred, 2nd argument -- confirm)
+ movu m5, [r1 + r3]
+ movu m6, [r1 + r3 * 2]
+ movu m7, [r1 + r4]
+ lea r1, [r1 + r3 * 4] ; advance second source by 4 rows
+
+ psubw m0, m4 ; per-word difference, row 0
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0 ; NOTE(review): macro is named _ALIGNED but uses unaligned moves (movu) -- confirm whether mova was intended
+ movu [r2 + r3], m1 ; destination uses the same row pitch (r3) as the sources
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3
+ lea r2, [r2 + r3 * 4] ; advance destination by 4 rows
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END 0 ; final 4-row step of the 16-bit sample path: same arithmetic, but no pointer is advanced
+ movu m0, [r0] ; rows 0..3 from r0 at offsets 0, r3, 2*r3, r4
+ movu m1, [r0 + r3]
+ movu m2, [r0 + r3 * 2]
+ movu m3, [r0 + r4] ; r4 is expected to hold 3 * r3 (set up by the entry point)
+
+ movu m4, [r1] ; matching rows 0..3 from r1
+ movu m5, [r1 + r3]
+ movu m6, [r1 + r3 * 2]
+ movu m7, [r1 + r4]
+
+ psubw m0, m4 ; per-word difference: r0 rows minus r1 rows
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0 ; store the 4 difference rows to r2 with row pitch r3
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3 ; last store; r0/r1/r2 are left untouched since nothing follows
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512 0 ; 8-bit sample path: widen bytes to words, [r2] = [r0] - [r1] for 4 rows, then step all pointers down 4 rows
+ pmovzxbw m0, [r0] ; zero-extend bytes of row 0 to words
+ pmovzxbw m1, [r0 + r3] ; r3 is the source row pitch in bytes (not doubled on this path)
+ pmovzxbw m2, [r0 + r3 * 2]
+ pmovzxbw m3, [r0 + r4] ; r4 is expected to hold 3 * r3 (set up by the entry point)
+ lea r0, [r0 + r3 * 4] ; advance first source by 4 rows
+
+ pmovzxbw m4, [r1] ; same four rows from the second source
+ pmovzxbw m5, [r1 + r3]
+ pmovzxbw m6, [r1 + r3 * 2]
+ pmovzxbw m7, [r1 + r4]
+ lea r1, [r1 + r3 * 4] ; advance second source by 4 rows
+
+ psubw m0, m4 ; per-word difference, row 0
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0 ; destination row pitch is r3 * 2 -- presumably because the output elements are 16-bit; confirm
+ movu [r2 + r3 * 2], m1
+ lea r2, [r2 + r3 * 4] ; r3 * 4 here is only 2 destination rows
+ movu [r2], m2
+ movu [r2 + r3 * 2], m3
+ lea r2, [r2 + r3 * 4] ; second 2-row step: destination is now 4 rows further
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END 0 ; final 4-row step of the 8-bit sample path: source pointers are not advanced afterwards
+ pmovzxbw m0, [r0] ; zero-extend bytes of rows 0..3 of the first source to words
+ pmovzxbw m1, [r0 + r3]
+ pmovzxbw m2, [r0 + r3 * 2]
+ pmovzxbw m3, [r0 + r4] ; r4 is expected to hold 3 * r3 (set up by the entry point)
+
+ pmovzxbw m4, [r1] ; same four rows from the second source
+ pmovzxbw m5, [r1 + r3]
+ pmovzxbw m6, [r1 + r3 * 2]
+ pmovzxbw m7, [r1 + r4]
+
+ psubw m0, m4 ; per-word difference: r0 rows minus r1 rows
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0 ; destination row pitch is r3 * 2
+ movu [r2 + r3 * 2], m1
+ lea r2, [r2 + r3 * 4] ; still needed: moves r2 to destination rows 2..3 of this step
+ movu [r2], m2
+ movu [r2 + r3 * 2], m3 ; last store; no further pointer update
+%endmacro
+
+
+%if HIGH_BIT_DEPTH ; variant that loads samples directly as words (no byte widening)
+INIT_ZMM avx512
+cglobal getResidual_aligned32, 4,5,8
+ add r3, r3 ; double r3 -- presumably converts an element stride into a byte stride for 16-bit samples; confirm
+ lea r4, [r3 * 3] ; r4 = 3 * r3: offset of the 4th row used inside each macro step
+
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END
+ RET
+%else ; variant that widens byte samples to words before subtracting; 7 steps + 1 END step = 8 x 4 = 32 rows in both variants
+INIT_ZMM avx512
+cglobal getResidual_aligned32, 4,5,8
+ lea r4, [r3 * 3] ; r4 = 3 * r3; r3 is used as-is (not doubled) on this path
+
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END
+ RET
+%endif
;-----------------------------------------------------------------------------
; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
diff -r d4ee703039c6 -r c497cbf5c2d5 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Oct 05 11:16:10 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Oct 04 16:33:33 2017 +0530
@@ -226,6 +226,31 @@
return true;
}
+bool PixelHarness::check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt) // runs ref and opt on the same inputs and compares their output buffers; returns false on first mismatch
+{
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); // NOTE(review): destinations are only 16-byte aligned -- confirm this is sufficient if the aligned primitive ever switches to aligned stores
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+ memset(ref_dest, 0, 64 * 64 * sizeof(int16_t)); // zero both buffers so untouched regions compare equal
+ memset(opt_dest, 0, 64 * 64 * sizeof(int16_t));
+
+ int j = 0; // running offset applied to both source pointers
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index = i % TEST_CASES; // cycle through the prepared test buffers
+ checked(opt, pbuf1 + j, pixel_test_buff[index] + j, opt_dest, stride); // presumably a harness wrapper that invokes opt with extra checks -- confirm
+ ref(pbuf1 + j, pixel_test_buff[index] + j, ref_dest, stride);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) // compare the whole 64x64 buffers, not just the block that was written
+ return false;
+
+ reportfail();
+ j += INCR; // NOTE(review): source pointers move by INCR each iteration -- confirm they stay suitably aligned for an "aligned" primitive
+ }
+
+ return true;
+}
+
bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
{
int j = 0;
@@ -2452,6 +2477,15 @@
}
}
+ if (opt.cu[i].calcresidual_aligned)
+ {
+ if (!check_calresidual_aligned(ref.cu[i].calcresidual_aligned, opt.cu[i].calcresidual_aligned))
+ {
+ printf("calcresidual_aligned width: %d failed!\n", 4 << i);
+ return false;
+ }
+ }
+
if (opt.cu[i].transpose)
{
if (!check_transpose(ref.cu[i].transpose, opt.cu[i].transpose))
@@ -3108,6 +3142,12 @@
REPORT_SPEEDUP(opt.cu[i].calcresidual, ref.cu[i].calcresidual, pbuf1, pbuf2, sbuf1, 64);
}
+ if (opt.cu[i].calcresidual_aligned)
+ {
+ HEADER("residual[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].calcresidual_aligned, ref.cu[i].calcresidual_aligned, pbuf1, pbuf2, sbuf1, 64);
+ }
+
if (opt.cu[i].blockfill_s)
{
HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
diff -r d4ee703039c6 -r c497cbf5c2d5 source/test/pixelharness.h
--- a/source/test/pixelharness.h Thu Oct 05 11:16:10 2017 +0530
+++ b/source/test/pixelharness.h Wed Oct 04 16:33:33 2017 +0530
@@ -87,6 +87,7 @@
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
+ bool check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt);
bool check_transpose(transpose_t ref, transpose_t opt);
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
More information about the x265-devel mailing list