[x265] [PATCH Review only] asm & testbench: psyCost_pp_4x4 in sse4: improve 2088c->337c
Divya Manivannan
divya at multicorewareinc.com
Mon Dec 29 09:19:58 CET 2014
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1419841142 -19800
# Mon Dec 29 13:49:02 2014 +0530
# Node ID b2960999295f668030756deb53ce08a50e7af7ca
# Parent 1bf769c6953d7c4f660d26a8618083ac1c0885e5
asm & testbench: psyCost_pp_4x4 in sse4: improve 2088c->337c
diff -r 1bf769c6953d -r b2960999295f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 29 13:49:02 2014 +0530
@@ -1434,6 +1434,8 @@
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4_HIGH(sse4);
+
+ p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
}
if (cpuMask & X265_CPU_XOP)
{
@@ -1715,6 +1717,8 @@
p.dct[DCT_8x8] = x265_dct8_sse4;
// p.denoiseDct = x265_denoise_dct_sse4;
+
+ p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 1bf769c6953d -r b2960999295f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/pixel-a.asm Mon Dec 29 13:49:02 2014 +0530
@@ -41,6 +41,8 @@
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
+hmul_8w: times 4 dw 1 ; words 0-3 = 1,1,1,1: with pmaddwd each adjacent word pair of the other operand is summed
+ times 2 dw 1, -1 ; words 4-7 = 1,-1,1,-1: with pmaddwd each adjacent word pair is differenced
ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
@@ -6579,3 +6581,165 @@
mov [r2], r3w
.end:
RET
+
+%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp -- packed-dword absolute value of two registers: dst1 = |src1|, dst2 = |src2|
+%if cpuflag(ssse3)
+ pabsd %1, %3 ; SSSE3 path: a single pabsd per register, temporaries unused
+ pabsd %2, %4
+%elifidn %1, %3
+ pxor %5, %5 ; in-place case, selected when dst1 and src1 are the same token; %4 is never read here, so dst2 is assumed to be src2 as well
+ pxor %6, %6
+ psubd %5, %1 ; tmp1 = 0 - dst1
+ psubd %6, %2 ; tmp2 = 0 - dst2
+ pmaxsd %1, %5 ; |x| = max(x, -x). NOTE(review): pmaxsd is an SSE4.1 instruction but this branch is only assembled when cpuflag(ssse3) is false -- confirm this fallback can ever execute
+ pmaxsd %2, %6
+%else
+ pxor %1, %1 ; distinct dst/src case: negate the sources directly into the destinations, temporaries unused
+ pxor %2, %2
+ psubd %1, %3 ; dst1 = 0 - src1
+ psubd %2, %4 ; dst2 = 0 - src2
+ pmaxsd %1, %3 ; dst1 = max(-src1, src1); NOTE(review): pmaxsd again requires SSE4.1 although this is a non-SSSE3 branch
+ pmaxsd %2, %4
+%endif
+%endmacro
+
+;---------------------------------------------------------------------------------------------------------------------
+;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+;---------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal psyCost_pp_4x4, 4, 5, 8 ; x86inc cglobal: 4 arguments (r0=source, r1=sstride, r2=recon, r3=rstride), 5 GPRs, 8 XMM regs; returns |T(source) - T(recon)| with T = transform abs-sum - (pixel sum >> 2)
+
+%if HIGH_BIT_DEPTH
+ FIX_STRIDES r1, r3 ; macro defined outside this patch; presumably rescales both strides for 16-bit pixels -- TODO confirm
+ lea r4, [3 * r1] ; r4 = 3 * r1, the offset of source row 3
+ movddup m0, [r0] ; source rows 0..3: 8 bytes per row (4 x 16-bit pixels), duplicated into both qword halves
+ movddup m1, [r0 + r1]
+ movddup m2, [r0 + r1 * 2]
+ movddup m3, [r0 + r4]
+ mova m4, [hmul_8w] ; words 1,1,1,1, 1,-1,1,-1
+ pmaddwd m0, m4 ; per-row dwords: p0+p1, p2+p3, p0-p1, p2-p3
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ paddd m5, m0, m1 ; m5 = the per-row dwords accumulated over all four rows
+ paddd m5, m2
+ paddd m5, m3
+ psrldq m4, m5, 4 ; shift right by one dword so the p2+p3 total lines up with the p0+p1 total
+ paddd m5, m4 ; dword 0 = sum of all 16 source pixels
+ psrld m5, 2 ; dword 0 = pixel sum >> 2
+
+ SUMSUB_BA d, 0, 1, 4 ; SUMSUB_BA and TRANS are macros defined outside this patch; presumably sum/difference butterflies across the row registers m0..m3 with m4 as scratch -- TODO confirm
+ SUMSUB_BA d, 2, 3, 4
+ SUMSUB_BA d, 0, 2, 4
+ SUMSUB_BA d, 1, 3, 4
+ %define ORDER unord
+ TRANS q, ORDER, 0, 2, 4, 6 ; presumably a qword-granularity transpose of m0/m2 using m4/m6 as scratch -- TODO confirm
+ TRANS q, ORDER, 1, 3, 4, 6
+ ABSD2 m0, m2, m0, m2, m4, m6 ; in-place absolute values of m0 and m2
+ pmaxsd m0, m2 ; per lane keep the larger of |m0| and |m2|
+ ABSD2 m1, m3, m1, m3, m4, m6 ; same for the m1/m3 pair
+ pmaxsd m1, m3
+ paddd m0, m1 ; horizontal reduction starts here: add the two partial vectors
+ movhlps m1, m0 ; fold the high qword onto the low qword
+ paddd m0, m1
+ psrldq m1, m0, 4 ; fold dword 1 onto dword 0
+ paddd m0, m1 ; dword 0 = total of all four lanes
+
+ psubd m7, m0, m5 ; m7 dword 0 = source term: transform abs-sum - (pixel sum >> 2)
+
+ lea r4, [3 * r3] ; r4 = 3 * r3, the offset of recon row 3
+ movddup m0, [r2] ; recon rows 0..3, loaded exactly like the source rows
+ movddup m1, [r2 + r3]
+ movddup m2, [r2 + r3 * 2]
+ movddup m3, [r2 + r4]
+ mova m4, [hmul_8w] ; reload the multiplier: m4 was used as scratch since the first load
+ pmaddwd m0, m4 ; per-row dwords: p0+p1, p2+p3, p0-p1, p2-p3
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+
+ paddd m5, m0, m1 ; m5 = the per-row dwords accumulated over all four recon rows
+ paddd m5, m2
+ paddd m5, m3
+ psrldq m4, m5, 4
+ paddd m5, m4 ; dword 0 = sum of all 16 recon pixels
+ psrld m5, 2 ; dword 0 = pixel sum >> 2
+
+ SUMSUB_BA d, 0, 1, 4 ; same transform sequence as applied to the source block
+ SUMSUB_BA d, 2, 3, 4
+ SUMSUB_BA d, 0, 2, 4
+ SUMSUB_BA d, 1, 3, 4
+ %define ORDER unord
+ TRANS q, ORDER, 0, 2, 4, 6
+ TRANS q, ORDER, 1, 3, 4, 6
+ ABSD2 m0, m2, m0, m2, m4, m6
+ pmaxsd m0, m2
+ ABSD2 m1, m3, m1, m3, m4, m6
+ pmaxsd m1, m3
+ paddd m0, m1
+ movhlps m1, m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1 ; dword 0 = total of all four lanes
+
+ psubd m0, m5 ; m0 dword 0 = recon term: transform abs-sum - (pixel sum >> 2)
+
+ psubd m7, m0 ; source term - recon term
+ pabsd m0, m7
+ movd eax, m0 ; return the absolute difference held in dword 0
+
+%else ; !HIGH_BIT_DEPTH
+ lea r4, [3 * r1] ; r4 = 3 * r1, the offset of source row 3
+ movd m0, [r0] ; source rows 0..3: 4 bytes per row (4 x 8-bit pixels)
+ movd m1, [r0 + r1]
+ movd m2, [r0 + r1 * 2]
+ movd m3, [r0 + r4]
+ shufps m0, m1, 0 ; m0 dwords = row0, row0, row1, row1
+ shufps m2, m3, 0 ; m2 dwords = row2, row2, row3, row3
+ mova m4, [hmul_4p] ; bytes 1,1,1,1, 1,-1,1,-1 repeated twice
+ pmaddubsw m0, m4 ; per-row words: p0+p1, p2+p3, p0-p1, p2-p3
+ pmaddubsw m2, m4
+
+ paddw m5, m0, m2 ; low qword = rows 0+2, high qword = rows 1+3
+ movhlps m4, m5 ; copy the high qword down so the two halves can be added
+ paddw m5, m4 ; low qword = the per-row words accumulated over all four rows
+ pmaddwd m5, [pw_1] ; dword 0 = word 0 + word 1 = sum of all 16 pixels (assumes pw_1, defined elsewhere, holds words of 1 -- TODO confirm)
+ psrld m5, 2 ; dword 0 = pixel sum >> 2
+
+ HADAMARD 0, sumsub, 0, 2, 1, 3 ; HADAMARD and HADDW are macros defined outside this patch; presumably Hadamard butterfly stages on m0/m2 with m1/m3 as scratch -- TODO confirm
+ HADAMARD 4, sumsub, 0, 2, 1, 3
+ HADAMARD 1, amax, 0, 2, 1, 3 ; presumably the final stage folded into an absolute-max step -- TODO confirm
+ HADDW m0, m2 ; presumably a horizontal add of m0's words using m2 as scratch -- TODO confirm
+
+ psubd m6, m0, m5 ; m6 dword 0 = source term: transform abs-sum - (pixel sum >> 2)
+
+ lea r4, [3 * r3] ; r4 = 3 * r3, the offset of recon row 3
+ movd m0, [r2] ; recon rows 0..3, loaded exactly like the source rows
+ movd m1, [r2 + r3]
+ movd m2, [r2 + r3 * 2]
+ movd m3, [r2 + r4]
+ shufps m0, m1, 0 ; m0 dwords = row0, row0, row1, row1
+ shufps m2, m3, 0 ; m2 dwords = row2, row2, row3, row3
+ mova m4, [hmul_4p] ; reload the multiplier: m4 was overwritten by movhlps above
+ pmaddubsw m0, m4 ; per-row words: p0+p1, p2+p3, p0-p1, p2-p3
+ pmaddubsw m2, m4
+
+ paddw m5, m0, m2 ; low qword = rows 0+2, high qword = rows 1+3
+ movhlps m4, m5
+ paddw m5, m4
+ pmaddwd m5, [pw_1] ; dword 0 = sum of all 16 recon pixels
+ psrld m5, 2 ; dword 0 = pixel sum >> 2
+
+ HADAMARD 0, sumsub, 0, 2, 1, 3 ; same transform sequence as applied to the source block
+ HADAMARD 4, sumsub, 0, 2, 1, 3
+ HADAMARD 1, amax, 0, 2, 1, 3
+ HADDW m0, m2
+
+ psubd m0, m5 ; m0 dword 0 = recon term: transform abs-sum - (pixel sum >> 2)
+
+ psubd m6, m0 ; source term - recon term
+ pabsd m0, m6
+ movd eax, m0 ; return the absolute difference held in dword 0
+%endif ; HIGH_BIT_DEPTH
+ RET
diff -r 1bf769c6953d -r b2960999295f source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/pixel.h Mon Dec 29 13:49:02 2014 +0530
@@ -218,6 +218,7 @@
void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+int x265_psyCost_pp_4x4_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
diff -r 1bf769c6953d -r b2960999295f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Dec 24 12:31:27 2014 +0530
+++ b/source/test/pixelharness.cpp Mon Dec 29 13:49:02 2014 +0530
@@ -948,6 +948,28 @@
return true;
}
+// Cross-checks an optimized psy-cost primitive against the reference one.
+// Every iteration picks two random test buffers, runs both implementations
+// on identical arguments and reports failure on the first differing result.
+bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
+{
+    const intptr_t stride = STRIDE;
+    int offset = 0; // displacement applied to the second buffer, grows by INCR per iteration
+
+    for (int iter = 0; iter < ITERS; iter++)
+    {
+        // first buffer index is drawn before the second one
+        const int firstIdx = rand() % TEST_CASES;
+        const int secondIdx = rand() % TEST_CASES;
+
+        // the optimized primitive goes through the checked() wrapper, the reference is called directly
+        const int optResult = (int)checked(opt, pixel_test_buff[firstIdx], stride, pixel_test_buff[secondIdx] + offset, stride);
+        const int refResult = ref(pixel_test_buff[firstIdx], stride, pixel_test_buff[secondIdx] + offset, stride);
+
+        if (refResult != optResult)
+            return false;
+
+        reportfail();
+        offset += INCR;
+    }
+
+    return true;
+}
+
bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.satd[part])
@@ -1290,6 +1312,15 @@
return false;
}
}
+
+ if (opt.psy_cost_pp[i])
+ {
+ if (!check_psyCost_pp(ref.psy_cost_pp[i], opt.psy_cost_pp[i]))
+ {
+ printf("\npsy_cost_pp[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
}
if (opt.weight_pp)
@@ -1631,6 +1662,12 @@
HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
}
+
+ if (opt.psy_cost_pp[i])
+ {
+ HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i], pbuf1, STRIDE, pbuf2, STRIDE);
+ }
}
if (opt.weight_pp)
diff -r 1bf769c6953d -r b2960999295f source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Dec 24 12:31:27 2014 +0530
+++ b/source/test/pixelharness.h Mon Dec 29 13:49:02 2014 +0530
@@ -92,6 +92,7 @@
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
+ bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
public:
More information about the x265-devel
mailing list