[x265] [PATCH 271 of 307] Aligned routine implementation of ssd_s primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:29 CEST 2018
# HG changeset patch
# User Jayashree
# Date 1513585304 -19800
# Mon Dec 18 13:51:44 2017 +0530
# Node ID fd28f49cb7b30aab97105a59ec841812af205cb9
# Parent 265fd2e1e49587837ebed4e7efcc38a0f6e79346
Aligned routine implementation of ssd_s primitive
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/pixel.cpp Mon Dec 18 13:51:44 2017 +0530
@@ -1009,7 +1009,8 @@
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
- p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[NONALIGNED] = pixel_ssd_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[ALIGNED] = pixel_ssd_s_c<W>; \
p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED] = getResidual<W>; \
p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED] = getResidual<W>; \
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/primitives.h
--- a/source/common/primitives.h Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/primitives.h Mon Dec 18 13:51:44 2017 +0530
@@ -290,9 +290,8 @@
pixel_sse_t sse_pp; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
pixel_sse_ss_t sse_ss; // Sum of Square Error (short, short) fenc alignment not assumed
pixelcmp_t psy_cost_pp; // difference in AC energy between two pixel blocks
- pixel_ssd_s_t ssd_s; // Sum of Square Error (residual coeff to self)
+ pixel_ssd_s_t ssd_s[NUM_ALIGNMENT_TYPES]; // Sum of Square Error (residual coeff to self)
pixelcmp_t sa8d; // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
-
transpose_t transpose; // transpose pixel block; for use with intra all-angs
intra_allangs_t intra_pred_allangs;
intra_filter_t intra_filter;
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 18 13:51:44 2017 +0530
@@ -1015,10 +1015,10 @@
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
#if X86_64
- p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4_sse2);
- p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8_sse2);
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_sse2);
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_sse2);
+ ASSIGN2(p.cu[BLOCK_4x4].ssd_s, pixel_ssd_s_4_sse2);
+ ASSIGN2(p.cu[BLOCK_8x8].ssd_s, pixel_ssd_s_8_sse2);
+ ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_sse2);
+ ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_sse2);
#endif
ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
@@ -1681,9 +1681,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
-
+ ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_avx2);
+ ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_avx2);
p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
@@ -2450,9 +2449,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
-
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
-
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_32_avx512);
p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
@@ -3235,10 +3233,9 @@
ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
- ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+ ALL_LUMA_TU_S(ssd_s[NONALIGNED], pixel_ssd_s_, sse2);
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
-
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
@@ -3822,9 +3819,8 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
-
+ ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_avx2);
+ ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_avx2);
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
@@ -4880,9 +4876,10 @@
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
-
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32_avx512);
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/pixel.h Mon Dec 18 13:51:44 2017 +0530
@@ -54,7 +54,9 @@
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/ssd-a.asm Mon Dec 18 13:51:44 2017 +0530
@@ -3489,3 +3489,116 @@
;-----------------------------------------------------------------------------
; ssd_s avx512 code end
;-----------------------------------------------------------------------------
+;-----------------------------------------------------------------------------
+;Aligned version of macro
+;-----------------------------------------------------------------------------
+%macro PROCESS_SSD_S_16x8_ALIGNED_AVX512 0
+ mova ym1, [r0]
+ vinserti32x8 m1, [r0 + r1], 1
+ mova ym2, [r0 + 2 * r1]
+ vinserti32x8 m2, [r0 + r3], 1
+ lea r0, [r0 + 4 * r1]
+ mova ym3, [r0]
+ vinserti32x8 m3, [r0 + r1], 1
+ mova ym4, [r0 + 2 * r1]
+ vinserti32x8 m4, [r0 + r3], 1
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1
+%endmacro
+;---------------------------------------------------------------------------------
+;int pixel_ssd_s_aligned( int16_t *ref, intptr_t i_stride )
+;-----------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_ZMM avx512
+
+
+cglobal pixel_ssd_s_aligned_16, 2,4,5
+ add r1, r1
+ lea r3, [r1 * 3]
+ pxor m0, m0
+
+ PROCESS_SSD_S_16x8_ALIGNED_AVX512
+ lea r0, [r0 + 4 * r1]
+ PROCESS_SSD_S_16x8_ALIGNED_AVX512
+
+ ; calculate sum and return
+ HADDD m0, m1
+ movd eax, xm0
+ RET
+%endif
+;---------------------------------------------------------------------------------------------
+; aligned implementation for 32
+;---------------------------------------------------------------------------------------------
+%macro PROCESS_SSD_S_32x8_ALIGNED_AVX512 0
+ mova m1, [r0]
+ mova m2, [r0 + r1]
+ mova m3, [r0 + 2 * r1]
+ mova m4, [r0 + r3]
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1
+
+ lea r0, [r0 + 4 * r1]
+
+ mova m1, [r0]
+ mova m2, [r0 + r1]
+ mova m3, [r0 + 2 * r1]
+ mova m4, [r0 + r3]
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1
+%endmacro
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_ssd_s_aligned_32, 2,4,5
+ add r1, r1
+ lea r3, [r1 * 3]
+ pxor m0, m0
+
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
+ lea r0, [r0 + 4 * r1]
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
+ lea r0, [r0 + 4 * r1]
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
+ lea r0, [r0 + 4 * r1]
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
+
+ ; calculate sum and return
+%if BIT_DEPTH >= 10
+ mova m1, m0
+ pxor m2, m2
+ punpckldq m0, m2
+ punpckhdq m1, m2
+ paddq m0, m1
+ vextracti32x8 ym2, m0, 1
+ paddq ym0, ym2
+ vextracti32x4 xm2, m0, 1
+ paddq xm2, xm0
+ movhlps xm1, xm2
+ paddq xm2, xm1
+ movq rax, xm2
+%else
+ HADDD m0, m1
+ movd eax, xm0
+%endif
+ RET
+%endif
\ No newline at end of file
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Dec 13 16:51:23 2017 +0530
+++ b/source/test/pixelharness.cpp Mon Dec 18 13:51:44 2017 +0530
@@ -267,10 +267,27 @@
reportfail();
j += INCR;
}
-
return true;
}
-
+bool PixelHarness::check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
+{
+ int j = 0;
+ for (int i = 0; i < ITERS; i++)
+ {
+ // NOTE: stride must be multiple of 16, because minimum block is 4x4
+ int stride = (STRIDE + (rand() % STRIDE)) & ~15;
+ sse_t cres = ref(sbuf1 + j, stride);
+ sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
+
+ if (cres != vres)
+ return false;
+
+ reportfail();
+ j += INCR+32;
+ }
+
+ return true;
+}
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
@@ -2619,16 +2636,22 @@
return false;
}
}
-
- if (opt.cu[i].ssd_s)
+ if (opt.cu[i].ssd_s[NONALIGNED])
{
- if (!check_ssd_s(ref.cu[i].ssd_s, opt.cu[i].ssd_s))
+ if (!check_ssd_s(ref.cu[i].ssd_s[NONALIGNED], opt.cu[i].ssd_s[NONALIGNED]))
{
printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
return false;
}
}
-
+ if (opt.cu[i].ssd_s[ALIGNED])
+ {
+ if (!check_ssd_s_aligned(ref.cu[i].ssd_s[ALIGNED], opt.cu[i].ssd_s[ALIGNED]))
+ {
+ printf("ssd_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
if (opt.cu[i].copy_cnt)
{
if (!check_copy_cnt_t(ref.cu[i].copy_cnt, opt.cu[i].copy_cnt))
@@ -3278,13 +3301,17 @@
measurePartition(part, ref, opt);
}
}
-
for (int i = 0; i < NUM_CU_SIZES; i++)
{
- if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s)
+ if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[NONALIGNED])
{
HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cu[i].ssd_s, ref.cu[i].ssd_s, sbuf1, STRIDE);
+ REPORT_SPEEDUP(opt.cu[i].ssd_s[NONALIGNED], ref.cu[i].ssd_s[NONALIGNED], sbuf1, STRIDE);
+ }
+ if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[ALIGNED])
+ {
+    HEADER("ssd_s_aligned[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].ssd_s[ALIGNED], ref.cu[i].ssd_s[ALIGNED], sbuf1, STRIDE);
}
if (opt.cu[i].sa8d)
{
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Dec 13 16:51:23 2017 +0530
+++ b/source/test/pixelharness.h Mon Dec 18 13:51:44 2017 +0530
@@ -87,6 +87,7 @@
bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
+ bool check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
More information about the x265-devel
mailing list