<div dir="ltr">If it fails only for 64x64, then it is definitely a range issue when we finally accumulate the sum of all the SAD calculations. It is more obvious with 64x64 because there are more accumulations there. An algorithm issue would have been reflected in the other partitions as well. <div><br></div><div>Regards,</div><div>Praveen <br></div></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Jan 9, 2015 at 4:05 PM, Steve Borho <span dir="ltr"><<a href="mailto:steve@borho.org" target="_blank">steve@borho.org</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">On 01/09, Divya Manivannan wrote:<br>
> # HG changeset patch<br>
> # User Divya Manivannan <<a href="mailto:divya@multicorewareinc.com">divya@multicorewareinc.com</a>><br>
> # Date 1420790181 -19800<br>
> # Fri Jan 09 13:26:21 2015 +0530<br>
> # Node ID 0f4b677cea64254d0b8f77ccc84c785bf832698d<br>
> # Parent c99e1a309bd1690be9a0a407050d97d95ccab05a<br>
> add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c<br>
<br>
</span>I get an error with a 10bit build:<br>
<br>
steve@zeppelin> ./test/TestBench<br>
Using random seed 54AFAEC9 16bpp<br>
Testing primitives: SSE2<br>
Testing primitives: SSE3<br>
Testing primitives: SSSE3<br>
Testing primitives: SSE4<br>
<br>
psy_cost_ss[64x64] failed!<br>
<div class="HOEnZb"><div class="h5"><br>
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/asm-primitives.cpp<br>
> --- a/source/common/x86/asm-primitives.cpp Fri Jan 09 13:09:39 2015 +0530<br>
> +++ b/source/common/x86/asm-primitives.cpp Fri Jan 09 13:26:21 2015 +0530<br>
> @@ -1430,6 +1430,7 @@<br>
> p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;<br>
> p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;<br>
> #endif<br>
> + p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;<br>
> }<br>
> if (cpuMask & X265_CPU_XOP)<br>
> {<br>
> @@ -1716,6 +1717,7 @@<br>
> p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;<br>
> p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;<br>
> #endif<br>
> + p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;<br>
> }<br>
> if (cpuMask & X265_CPU_AVX)<br>
> {<br>
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel-a.asm<br>
> --- a/source/common/x86/pixel-a.asm Fri Jan 09 13:09:39 2015 +0530<br>
> +++ b/source/common/x86/pixel-a.asm Fri Jan 09 13:26:21 2015 +0530<br>
> @@ -7569,3 +7569,157 @@<br>
> RET<br>
> %endif ; HIGH_BIT_DEPTH<br>
> %endif<br>
> +<br>
> +;---------------------------------------------------------------------------------------------------------------------<br>
> +;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)<br>
> +;---------------------------------------------------------------------------------------------------------------------<br>
> +INIT_XMM sse4<br>
> +cglobal psyCost_ss_4x4, 4, 5, 8<br>
> +<br>
> + add r1, r1<br>
> + lea r4, [3 * r1]<br>
> + movddup m0, [r0]<br>
> + movddup m1, [r0 + r1]<br>
> + movddup m2, [r0 + r1 * 2]<br>
> + movddup m3, [r0 + r4]<br>
> +<br>
> + pabsw m4, m0<br>
> + pabsw m5, m1<br>
> + paddw m5, m4<br>
> + pabsw m4, m2<br>
> + paddw m5, m4<br>
> + pabsw m4, m3<br>
> + paddw m5, m4<br>
> + pmaddwd m5, [pw_1]<br>
> + psrldq m4, m5, 4<br>
> + paddd m5, m4<br>
> + psrld m6, m5, 2<br>
> +<br>
> + mova m4, [hmul_8w]<br>
> + pmaddwd m0, m4<br>
> + pmaddwd m1, m4<br>
> + pmaddwd m2, m4<br>
> + pmaddwd m3, m4<br>
> +<br>
> + psrldq m4, m0, 4<br>
> + psubd m5, m0, m4<br>
> + paddd m0, m4<br>
> + shufps m0, m5, 10001000b<br>
> +<br>
> + psrldq m4, m1, 4<br>
> + psubd m5, m1, m4<br>
> + paddd m1, m4<br>
> + shufps m1, m5, 10001000b<br>
> +<br>
> + psrldq m4, m2, 4<br>
> + psubd m5, m2, m4<br>
> + paddd m2, m4<br>
> + shufps m2, m5, 10001000b<br>
> +<br>
> + psrldq m4, m3, 4<br>
> + psubd m5, m3, m4<br>
> + paddd m3, m4<br>
> + shufps m3, m5, 10001000b<br>
> +<br>
> + mova m4, m0<br>
> + paddd m0, m1<br>
> + psubd m1, m4<br>
> + mova m4, m2<br>
> + paddd m2, m3<br>
> + psubd m3, m4<br>
> + mova m4, m0<br>
> + paddd m0, m2<br>
> + psubd m2, m4<br>
> + mova m4, m1<br>
> + paddd m1, m3<br>
> + psubd m3, m4<br>
> +<br>
> + pabsd m0, m0<br>
> + pabsd m2, m2<br>
> + pabsd m1, m1<br>
> + pabsd m3, m3<br>
> + paddd m0, m2<br>
> + paddd m1, m3<br>
> + paddd m0, m1<br>
> + movhlps m1, m0<br>
> + paddd m0, m1<br>
> + psrldq m1, m0, 4<br>
> + paddd m0, m1<br>
> + psrld m0, 1<br>
> + psubd m7, m0, m6<br>
> +<br>
> + add r3, r3<br>
> + lea r4, [3 * r3]<br>
> + movddup m0, [r2]<br>
> + movddup m1, [r2 + r3]<br>
> + movddup m2, [r2 + r3 * 2]<br>
> + movddup m3, [r2 + r4]<br>
> +<br>
> + pabsw m4, m0<br>
> + pabsw m5, m1<br>
> + paddw m5, m4<br>
> + pabsw m4, m2<br>
> + paddw m5, m4<br>
> + pabsw m4, m3<br>
> + paddw m5, m4<br>
> + pmaddwd m5, [pw_1]<br>
> + psrldq m4, m5, 4<br>
> + paddd m5, m4<br>
> + psrld m6, m5, 2<br>
> +<br>
> + mova m4, [hmul_8w]<br>
> + pmaddwd m0, m4<br>
> + pmaddwd m1, m4<br>
> + pmaddwd m2, m4<br>
> + pmaddwd m3, m4<br>
> +<br>
> + psrldq m4, m0, 4<br>
> + psubd m5, m0, m4<br>
> + paddd m0, m4<br>
> + shufps m0, m5, 10001000b<br>
> +<br>
> + psrldq m4, m1, 4<br>
> + psubd m5, m1, m4<br>
> + paddd m1, m4<br>
> + shufps m1, m5, 10001000b<br>
> +<br>
> + psrldq m4, m2, 4<br>
> + psubd m5, m2, m4<br>
> + paddd m2, m4<br>
> + shufps m2, m5, 10001000b<br>
> +<br>
> + psrldq m4, m3, 4<br>
> + psubd m5, m3, m4<br>
> + paddd m3, m4<br>
> + shufps m3, m5, 10001000b<br>
> +<br>
> + mova m4, m0<br>
> + paddd m0, m1<br>
> + psubd m1, m4<br>
> + mova m4, m2<br>
> + paddd m2, m3<br>
> + psubd m3, m4<br>
> + mova m4, m0<br>
> + paddd m0, m2<br>
> + psubd m2, m4<br>
> + mova m4, m1<br>
> + paddd m1, m3<br>
> + psubd m3, m4<br>
> +<br>
> + pabsd m0, m0<br>
> + pabsd m2, m2<br>
> + pabsd m1, m1<br>
> + pabsd m3, m3<br>
> + paddd m0, m2<br>
> + paddd m1, m3<br>
> + paddd m0, m1<br>
> + movhlps m1, m0<br>
> + paddd m0, m1<br>
> + psrldq m1, m0, 4<br>
> + paddd m0, m1<br>
> + psrld m0, 1<br>
> + psubd m0, m6<br>
> + psubd m7, m0<br>
> + pabsd m0, m7<br>
> + movd eax, m0<br>
> + RET<br>
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel.h<br>
> --- a/source/common/x86/pixel.h Fri Jan 09 13:09:39 2015 +0530<br>
> +++ b/source/common/x86/pixel.h Fri Jan 09 13:26:21 2015 +0530<br>
> @@ -223,6 +223,7 @@<br>
> int x265_psyCost_pp_16x16_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>
> int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>
> int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>
> +int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);<br>
><br>
> #undef DECL_PIXELS<br>
> #undef DECL_HEVC_SSD<br>
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.cpp<br>
> --- a/source/test/pixelharness.cpp Fri Jan 09 13:09:39 2015 +0530<br>
> +++ b/source/test/pixelharness.cpp Fri Jan 09 13:26:21 2015 +0530<br>
> @@ -1089,6 +1089,28 @@<br>
> return true;<br>
> }<br>
><br>
> +bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)<br>
> +{<br>
> + int j = 0, index1, index2, optres, refres;<br>
> + intptr_t stride = STRIDE;<br>
> +<br>
> + for (int i = 0; i < ITERS; i++)<br>
> + {<br>
> + index1 = rand() % TEST_CASES;<br>
> + index2 = rand() % TEST_CASES;<br>
> + optres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);<br>
> + refres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);<br>
> +<br>
> + if (optres != refres)<br>
> + return false;<br>
> +<br>
> + reportfail();<br>
> + j += INCR;<br>
> + }<br>
> +<br>
> + return true;<br>
> +}<br>
> +<br>
> bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)<br>
> {<br>
> ALIGN_VAR_16(pixel, ref_dest[64 * 64]);<br>
> @@ -1470,6 +1492,15 @@<br>
> return false;<br>
> }<br>
> }<br>
> +<br>
> + if (opt.psy_cost_ss[i])<br>
> + {<br>
> + if (!check_psyCost_ss(ref.psy_cost_ss[i], opt.psy_cost_ss[i]))<br>
> + {<br>
> + printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 << i);<br>
> + return false;<br>
> + }<br>
> + }<br>
> }<br>
><br>
> if (opt.weight_pp)<br>
> @@ -1862,6 +1893,12 @@<br>
> HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);<br>
> REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i], pbuf1, STRIDE, pbuf2, STRIDE);<br>
> }<br>
> +<br>
> + if (opt.psy_cost_ss[i])<br>
> + {<br>
> + HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);<br>
> + REPORT_SPEEDUP(opt.psy_cost_ss[i], ref.psy_cost_ss[i], sbuf1, STRIDE, sbuf2, STRIDE);<br>
> + }<br>
> }<br>
><br>
> if (opt.weight_pp)<br>
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.h<br>
> --- a/source/test/pixelharness.h Fri Jan 09 13:09:39 2015 +0530<br>
> +++ b/source/test/pixelharness.h Fri Jan 09 13:26:21 2015 +0530<br>
> @@ -101,6 +101,7 @@<br>
> bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);<br>
> bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);<br>
> bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);<br>
> + bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);<br>
> bool check_calSign(sign_t ref, sign_t opt);<br>
><br>
> public:<br>
> _______________________________________________<br>
> x265-devel mailing list<br>
> <a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
> <a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br>
</div></div><span class="HOEnZb"><font color="#888888">--<br>
Steve Borho<br>
</font></span><div class="HOEnZb"><div class="h5">_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</div></div></blockquote></div><br></div>