<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Wed, Feb 5, 2014 at 3:33 AM, <span dir="ltr">&lt;<a href="mailto:yuvaraj@multicorewareinc.com" target="_blank">yuvaraj@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"># HG changeset patch<br>
# User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;<br>
# Date 1391592757 -19800<br>
# Wed Feb 05 15:02:37 2014 +0530<br>
# Node ID b14a8528c478bf3068ed95aeef68c050014785cd<br>
# Parent 1374f1168c5cbb97a893172e37bd9f5c6ed5690c<br>
asm: modified satd and sad asm functions in 16bpp to avoid overflow<br></blockquote><div><br></div><div>this patch causes 8bpp testbench failures</div><div><br></div><div>
<p class="">steve@<span class="">zeppelin</span>> ./test/TestBench <span class="">~/repos/x265/build/linux</span></p>
<p class="">Using random seed 52F28F46 8bpp</p>
<p class="">Testing primitives: SSE2</p>
<p class="">Testing primitives: SSE3</p>
<p class="">Testing primitives: SSSE3</p>
<p class="">sa8d_inter[12x16]: failed!</p></div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
<br>
diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/pixel-a.asm<br>
--- a/source/common/x86/pixel-a.asm Wed Feb 05 12:59:57 2014 +0530<br>
+++ b/source/common/x86/pixel-a.asm Wed Feb 05 15:02:37 2014 +0530<br>
@@ -511,7 +511,7 @@<br>
%endif<br>
%endmacro<br>
<br>
-%macro SATD_4x8_SSE 3<br>
+%macro SATD_4x8_SSE 3-4<br>
%if HIGH_BIT_DEPTH<br>
movh m0, [r0+0*r1]<br>
movh m4, [r2+0*r3]<br>
@@ -577,7 +577,11 @@<br>
DIFFOP 2, 6, 3, 5, 7<br>
%endif<br>
%endif ; HIGH_BIT_DEPTH<br>
+%if %0 == 4<br>
+ SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4<br>
+%else<br>
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3<br>
+%endif<br>
%endmacro<br>
<br>
;-----------------------------------------------------------------------------<br>
@@ -2391,56 +2395,66 @@<br>
SATD_START_MMX<br>
mov r6, r0<br>
mov r7, r2<br>
+ pxor m7, m7<br>
%if vertical==0<br>
mova m7, [hmul_4p]<br>
%endif<br>
- SATD_4x8_SSE vertical, 0, swap<br>
+ SATD_4x8_SSE vertical, 0, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r6 + 4*SIZEOF_PIXEL]<br>
lea r2, [r7 + 4*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r6 + 8*SIZEOF_PIXEL]<br>
lea r2, [r7 + 8*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
- HADDW m7, m1<br>
- movd eax, m7<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
+ pxor m1, m1<br>
+ movhlps m1, m7<br>
+ paddd m7, m1<br>
+ pshufd m1, m7, 1<br>
+ paddd m7, m1<br>
+ movd eax, m7<br>
RET<br>
%else<br>
cglobal pixel_satd_12x16, 4,7,8,0-gprsize<br>
SATD_START_MMX<br>
mov r6, r0<br>
mov [rsp], r2<br>
+ pxor m7, m7<br>
%if vertical==0<br>
mova m7, [hmul_4p]<br>
%endif<br>
- SATD_4x8_SSE vertical, 0, swap<br>
+ SATD_4x8_SSE vertical, 0, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r6 + 4*SIZEOF_PIXEL]<br>
mov r2, [rsp]<br>
add r2, 4*SIZEOF_PIXEL<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r6 + 8*SIZEOF_PIXEL]<br>
mov r2, [rsp]<br>
add r2, 8*SIZEOF_PIXEL<br>
- SATD_4x8_SSE vertical, 1, add<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
lea r0, [r0 + r1*2*SIZEOF_PIXEL]<br>
lea r2, [r2 + r3*2*SIZEOF_PIXEL]<br>
- SATD_4x8_SSE vertical, 1, add<br>
- HADDW m7, m1<br>
- movd eax, m7<br>
+ SATD_4x8_SSE vertical, 1, 4, 5<br>
+ pxor m1, m1<br>
+ movhlps m1, m7<br>
+ paddd m7, m1<br>
+ pshufd m1, m7, 1<br>
+ paddd m7, m1<br>
+ movd eax, m7<br>
RET<br>
%endif<br>
<br>
diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/sad16-a.asm<br>
--- a/source/common/x86/sad16-a.asm Wed Feb 05 12:59:57 2014 +0530<br>
+++ b/source/common/x86/sad16-a.asm Wed Feb 05 15:02:37 2014 +0530<br>
@@ -274,9 +274,10 @@<br>
lea r0, [r0+4*r1]<br>
lea r2, [r2+4*r3]<br>
ABSW2 m3, m4, m3, m4, m7, m5<br>
- paddd m1, m2<br>
- paddd m3, m4<br>
- paddd m0, m1<br>
+ paddw m1, m2<br>
+ paddw m3, m4<br>
+ paddw m3, m1<br>
+ pmaddwd m3, [pw_1]<br>
paddd m0, m3<br>
%else<br>
movu m1, [r2]<br>
@@ -286,8 +287,9 @@<br>
ABSW2 m1, m2, m1, m2, m3, m4<br>
lea r0, [r0+4*r1]<br>
lea r2, [r2+4*r3]<br>
- paddw m0, m1<br>
- paddw m0, m2<br>
+ paddw m2, m1<br>
+ pmaddwd m2, [pw_1]<br>
+ paddd m0, m2<br>
%endif<br>
%endmacro<br>
<br>
@@ -308,7 +310,7 @@<br>
jg .loop<br>
%endif<br>
<br>
- HADDW m0, m1<br>
+ HADDD m0, m1<br>
movd eax, xm0<br>
RET<br>
%endmacro<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>