[x265] [PATCH] add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c
Divya Manivannan
divya at multicorewareinc.com
Fri Jan 9 08:58:04 CET 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1420790181 -19800
# Fri Jan 09 13:26:21 2015 +0530
# Node ID 0f4b677cea64254d0b8f77ccc84c785bf832698d
# Parent c99e1a309bd1690be9a0a407050d97d95ccab05a
add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c
diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jan 09 13:09:39 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jan 09 13:26:21 2015 +0530
@@ -1430,6 +1430,7 @@
p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
#endif
+ p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
}
if (cpuMask & X265_CPU_XOP)
{
@@ -1716,6 +1717,7 @@
p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
#endif
+ p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Fri Jan 09 13:09:39 2015 +0530
+++ b/source/common/x86/pixel-a.asm Fri Jan 09 13:26:21 2015 +0530
@@ -7569,3 +7569,157 @@
RET
%endif ; HIGH_BIT_DEPTH
%endif
+
+;---------------------------------------------------------------------------------------------------------------------
+;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
+;---------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4 ; x86inc: build the following function with XMM registers for the sse4 target
+cglobal psyCost_ss_4x4, 4, 5, 8 ; returns in eax |E(source) - E(recon)| for 4x4 int16_t blocks, E = (sum |transformed| >> 1) - (sum |sample| >> 2); x86inc: 4 args, 5 GPRs, 8 XMM regs
+
+ add r1, r1 ; r1 = sstride * 2, so the stride can be used as a byte offset for int16_t samples
+ lea r4, [3 * r1] ; r4 = 3 * doubled sstride (offset of row 3)
+ movddup m0, [r0] ; source row 0: load 8 bytes (4 words) and duplicate them into both halves
+ movddup m1, [r0 + r1] ; source row 1, duplicated the same way
+ movddup m2, [r0 + r1 * 2] ; source row 2
+ movddup m3, [r0 + r4] ; source row 3
+
+ pabsw m4, m0 ; |row 0|
+ pabsw m5, m1 ; |row 1|
+ paddw m5, m4 ; running per-column sum of absolute values
+ pabsw m4, m2 ; |row 2|
+ paddw m5, m4
+ pabsw m4, m3 ; |row 3|
+ paddw m5, m4 ; m5 = per-column sums of |sample| over the 4 rows (both halves identical)
+ pmaddwd m5, [pw_1] ; combine adjacent word pairs into dwords (assumes pw_1 is all-ones words; defined elsewhere - TODO confirm)
+ psrldq m4, m5, 4 ; m4 = m5 shifted right by one dword
+ paddd m5, m4 ; dword 0 = sum of |sample| over the whole 4x4 block
+ psrld m6, m5, 2 ; m6 = that sum >> 2 (only dword 0 is consumed, at the psubd into m7 below)
+
+ mova m4, [hmul_8w] ; multiplier table defined elsewhere - presumably +1/-1 words giving pair sums and pair differences; TODO confirm
+ pmaddwd m0, m4 ; row 0 -> 4 dwords of word-pair products summed
+ pmaddwd m1, m4 ; row 1
+ pmaddwd m2, m4 ; row 2
+ pmaddwd m3, m4 ; row 3
+
+ psrldq m4, m0, 4 ; m4 = row 0 dwords shifted down by one position
+ psubd m5, m0, m4 ; m5 = differences of neighbouring dwords
+ paddd m0, m4 ; m0 = sums of neighbouring dwords
+ shufps m0, m5, 10001000b ; m0 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m1, 4 ; same sum/difference step for row 1
+ psubd m5, m1, m4
+ paddd m1, m4
+ shufps m1, m5, 10001000b ; m1 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m2, 4 ; same sum/difference step for row 2
+ psubd m5, m2, m4
+ paddd m2, m4
+ shufps m2, m5, 10001000b ; m2 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m3, 4 ; same sum/difference step for row 3
+ psubd m5, m3, m4
+ paddd m3, m4
+ shufps m3, m5, 10001000b ; m3 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ mova m4, m0 ; keep a copy of row 0 for the subtraction below
+ paddd m0, m1 ; m0 = row0 + row1
+ psubd m1, m4 ; m1 = row1 - row0
+ mova m4, m2 ; keep a copy of row 2
+ paddd m2, m3 ; m2 = row2 + row3
+ psubd m3, m4 ; m3 = row3 - row2
+ mova m4, m0 ; keep (row0 + row1)
+ paddd m0, m2 ; m0 = (row0 + row1) + (row2 + row3)
+ psubd m2, m4 ; m2 = (row2 + row3) - (row0 + row1)
+ mova m4, m1 ; keep (row1 - row0)
+ paddd m1, m3 ; m1 = (row1 - row0) + (row3 - row2)
+ psubd m3, m4 ; m3 = (row3 - row2) - (row1 - row0)
+
+ pabsd m0, m0 ; absolute value of every transformed dword
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ paddd m0, m2 ; accumulate the four registers...
+ paddd m1, m3
+ paddd m0, m1 ; ...into four partial sums in m0
+ movhlps m1, m0 ; low half of m1 = high half of m0
+ paddd m0, m1 ; fold upper two dwords onto lower two
+ psrldq m1, m0, 4
+ paddd m0, m1 ; dword 0 = sum of all 16 absolute transformed values
+ psrld m0, 1 ; halve the transformed sum
+ psubd m7, m0, m6 ; m7 = source energy = (transformed sum >> 1) - (sample sum >> 2)
+
+ add r3, r3 ; r3 = rstride * 2 (byte offset for int16_t samples)
+ lea r4, [3 * r3] ; r4 = 3 * doubled rstride (offset of row 3)
+ movddup m0, [r2] ; recon row 0, 4 words duplicated into both halves
+ movddup m1, [r2 + r3] ; recon row 1
+ movddup m2, [r2 + r3 * 2] ; recon row 2
+ movddup m3, [r2 + r4] ; recon row 3
+
+ pabsw m4, m0 ; same absolute-sample accumulation as for the source block
+ pabsw m5, m1
+ paddw m5, m4
+ pabsw m4, m2
+ paddw m5, m4
+ pabsw m4, m3
+ paddw m5, m4 ; m5 = per-column sums of |sample| for recon
+ pmaddwd m5, [pw_1] ; combine adjacent word pairs into dwords (same pw_1 assumption as above)
+ psrldq m4, m5, 4
+ paddd m5, m4 ; dword 0 = sum of |sample| over the recon block
+ psrld m6, m5, 2 ; m6 = recon sample sum >> 2 (m6 is reused; the source value already went into m7)
+
+ mova m4, [hmul_8w] ; same multiplier table as in the source half
+ pmaddwd m0, m4 ; recon row 0
+ pmaddwd m1, m4 ; recon row 1
+ pmaddwd m2, m4 ; recon row 2
+ pmaddwd m3, m4 ; recon row 3
+
+ psrldq m4, m0, 4 ; per-row sum/difference step, identical to the source half
+ psubd m5, m0, m4
+ paddd m0, m4
+ shufps m0, m5, 10001000b ; m0 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m1, 4
+ psubd m5, m1, m4
+ paddd m1, m4
+ shufps m1, m5, 10001000b ; m1 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m2, 4
+ psubd m5, m2, m4
+ paddd m2, m4
+ shufps m2, m5, 10001000b ; m2 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ psrldq m4, m3, 4
+ psubd m5, m3, m4
+ paddd m3, m4
+ shufps m3, m5, 10001000b ; m3 = { sums[0], sums[2], diffs[0], diffs[2] }
+
+ mova m4, m0 ; cross-row add/subtract network, identical to the source half
+ paddd m0, m1 ; m0 = row0 + row1
+ psubd m1, m4 ; m1 = row1 - row0
+ mova m4, m2
+ paddd m2, m3 ; m2 = row2 + row3
+ psubd m3, m4 ; m3 = row3 - row2
+ mova m4, m0
+ paddd m0, m2 ; m0 = (row0 + row1) + (row2 + row3)
+ psubd m2, m4 ; m2 = (row2 + row3) - (row0 + row1)
+ mova m4, m1
+ paddd m1, m3 ; m1 = (row1 - row0) + (row3 - row2)
+ psubd m3, m4 ; m3 = (row3 - row2) - (row1 - row0)
+
+ pabsd m0, m0 ; absolute value of every transformed dword
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1 ; four partial sums in m0
+ movhlps m1, m0 ; low half of m1 = high half of m0
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1 ; dword 0 = sum of all 16 absolute transformed values
+ psrld m0, 1 ; halve the transformed sum
+ psubd m0, m6 ; m0 = recon energy = (transformed sum >> 1) - (sample sum >> 2)
+ psubd m7, m0 ; m7 = source energy - recon energy
+ pabsd m0, m7 ; absolute value of the energy difference
+ movd eax, m0 ; return dword 0 as the int result
+ RET
diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Fri Jan 09 13:09:39 2015 +0530
+++ b/source/common/x86/pixel.h Fri Jan 09 13:26:21 2015 +0530
@@ -223,6 +223,7 @@
int x265_psyCost_pp_16x16_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Jan 09 13:09:39 2015 +0530
+++ b/source/test/pixelharness.cpp Fri Jan 09 13:26:21 2015 +0530
@@ -1089,6 +1089,28 @@
return true;
}
+bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt) // Runs opt and ref on the same randomly chosen inputs; returns false at the first differing result, true otherwise.
+{
+ int j = 0, index1, index2, optres, refres; // j is the element offset added to the second (recon) buffer pointer
+ intptr_t stride = STRIDE; // the same stride is passed for both buffers
+
+ for (int i = 0; i < ITERS; i++) // ITERS trials
+ {
+ index1 = rand() % TEST_CASES; // random buffer index for the source argument
+ index2 = rand() % TEST_CASES; // random buffer index for the recon argument
+ optres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride); // opt is invoked through checked(), a harness helper defined elsewhere
+ refres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride); // ref is called directly with identical arguments
+
+ if (optres != refres) // any mismatch fails the whole check
+ return false;
+
+ reportfail(); // harness macro defined elsewhere - presumably reports a problem detected by checked(); TODO confirm
+ j += INCR; // move the recon offset forward for the next trial
+ }
+
+ return true; // every trial produced matching results
+}
+
bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1470,6 +1492,15 @@
return false;
}
}
+
+ if (opt.psy_cost_ss[i])
+ {
+ if (!check_psyCost_ss(ref.psy_cost_ss[i], opt.psy_cost_ss[i]))
+ {
+ printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
}
if (opt.weight_pp)
@@ -1862,6 +1893,12 @@
HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i], pbuf1, STRIDE, pbuf2, STRIDE);
}
+
+ if (opt.psy_cost_ss[i])
+ {
+ HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.psy_cost_ss[i], ref.psy_cost_ss[i], sbuf1, STRIDE, sbuf2, STRIDE);
+ }
}
if (opt.weight_pp)
diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.h
--- a/source/test/pixelharness.h Fri Jan 09 13:09:39 2015 +0530
+++ b/source/test/pixelharness.h Fri Jan 09 13:26:21 2015 +0530
@@ -101,6 +101,7 @@
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
+ bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
bool check_calSign(sign_t ref, sign_t opt);
public:
More information about the x265-devel
mailing list