<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>It is slower, since SAD is the bottleneck in ME.<br>Could you write an unaligned version for RDO-RC?</div><pre><br>At 2014-06-04 19:55:54,yuvaraj@multicorewareinc.com wrote:
># HG changeset patch
># User Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com>
># Date 1401882930 -19800
># Wed Jun 04 17:25:30 2014 +0530
># Node ID dd62f0a2a2993d102daab51237dce2cf6f445243
># Parent 61a02a9c78eb0ec2a1cdaf05562a77b37355432d
>asm: fix crash when passing unaligned buffer to sad function
>
>In psy-rd some input buffers are unaligned, which causes crashes
>in the sad primitives; fix those crashes
>
>diff -r 61a02a9c78eb -r dd62f0a2a299 source/common/x86/sad-a.asm
>--- a/source/common/x86/sad-a.asm Wed Jun 04 15:37:37 2014 +0530
>+++ b/source/common/x86/sad-a.asm Wed Jun 04 17:25:30 2014 +0530
>@@ -213,32 +213,40 @@
> %macro PROCESS_SAD_32x4 0
> movu m1, [r2]
> movu m2, [r2 + 16]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
> paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
> movu m1, [r2]
> movu m2, [r2 + 16]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
> paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
> movu m1, [r2]
> movu m2, [r2 + 16]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
> paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
> movu m1, [r2]
> movu m2, [r2 + 16]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
> paddd m0, m1
> lea r2, [r2 + r3]
>@@ -319,61 +327,77 @@
> %macro PROCESS_SAD_64x4 0
> movu m1, [r2]
> movu m2, [r2 + 16]
>- movu m3, [r2 + 32]
>- movu m4, [r2 + 48]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>- psadbw m3, [r0 + 32]
>- psadbw m4, [r0 + 48]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
>- paddd m3, m4
> paddd m0, m1
>- paddd m0, m3
>+ movu m1, [r2 + 32]
>+ movu m2, [r2 + 48]
>+ movu m3, [r0 + 32]
>+ movu m4, [r0 + 48]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
>
> movu m1, [r2]
> movu m2, [r2 + 16]
>- movu m3, [r2 + 32]
>- movu m4, [r2 + 48]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>- psadbw m3, [r0 + 32]
>- psadbw m4, [r0 + 48]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
>- paddd m3, m4
> paddd m0, m1
>- paddd m0, m3
>+ movu m1, [r2 + 32]
>+ movu m2, [r2 + 48]
>+ movu m3, [r0 + 32]
>+ movu m4, [r0 + 48]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
>
> movu m1, [r2]
> movu m2, [r2 + 16]
>- movu m3, [r2 + 32]
>- movu m4, [r2 + 48]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>- psadbw m3, [r0 + 32]
>- psadbw m4, [r0 + 48]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
>- paddd m3, m4
> paddd m0, m1
>- paddd m0, m3
>+ movu m1, [r2 + 32]
>+ movu m2, [r2 + 48]
>+ movu m3, [r0 + 32]
>+ movu m4, [r0 + 48]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
>
> movu m1, [r2]
> movu m2, [r2 + 16]
>- movu m3, [r2 + 32]
>- movu m4, [r2 + 48]
>- psadbw m1, [r0]
>- psadbw m2, [r0 + 16]
>- psadbw m3, [r0 + 32]
>- psadbw m4, [r0 + 48]
>+ movu m3, [r0]
>+ movu m4, [r0 + 16]
>+ psadbw m1, m3
>+ psadbw m2, m4
> paddd m1, m2
>- paddd m3, m4
> paddd m0, m1
>- paddd m0, m3
>+ movu m1, [r2 + 32]
>+ movu m2, [r2 + 48]
>+ movu m3, [r0 + 32]
>+ movu m4, [r0 + 48]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
> lea r2, [r2 + r3]
> lea r0, [r0 + r1]
> %endmacro
>@@ -382,68 +406,86 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_16x16, 4,4,8
>- movu m0, [r2]
>- movu m1, [r2+r3]
>- lea r2, [r2+2*r3]
>- movu m2, [r2]
>- movu m3, [r2+r3]
>- lea r2, [r2+2*r3]
>- psadbw m0, [r0]
>- psadbw m1, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m4, [r2]
>- paddw m0, m1
>- psadbw m2, [r0]
>- psadbw m3, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m5, [r2+r3]
>- lea r2, [r2+2*r3]
>- paddw m2, m3
>- movu m6, [r2]
>- movu m7, [r2+r3]
>- lea r2, [r2+2*r3]
>- paddw m0, m2
>- psadbw m4, [r0]
>- psadbw m5, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m1, [r2]
>- paddw m4, m5
>- psadbw m6, [r0]
>- psadbw m7, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m2, [r2+r3]
>- lea r2, [r2+2*r3]
>- paddw m6, m7
>- movu m3, [r2]
>- paddw m0, m4
>- movu m4, [r2+r3]
>- lea r2, [r2+2*r3]
>- paddw m0, m6
>- psadbw m1, [r0]
>- psadbw m2, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m5, [r2]
>- paddw m1, m2
>- psadbw m3, [r0]
>- psadbw m4, [r0+r1]
>- lea r0, [r0+2*r1]
>- movu m6, [r2+r3]
>- lea r2, [r2+2*r3]
>- paddw m3, m4
>- movu m7, [r2]
>- paddw m0, m1
>- movu m1, [r2+r3]
>- paddw m0, m3
>- psadbw m5, [r0]
>- psadbw m6, [r0+r1]
>- lea r0, [r0+2*r1]
>- paddw m5, m6
>- psadbw m7, [r0]
>- psadbw m1, [r0+r1]
>- paddw m7, m1
>- paddw m0, m5
>- paddw m0, m7
>+cglobal pixel_sad_16x16, 4,4,5
>+ pxor m0, m0
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
>+ lea r2, [r2 + r3 * 2]
>+ lea r0, [r0 + r1 * 2]
>+ movu m1, [r2]
>+ movu m2, [r2 + r3]
>+ movu m3, [r0]
>+ movu m4, [r0 + r1]
>+ psadbw m1, m3
>+ psadbw m2, m4
>+ paddd m1, m2
>+ paddd m0, m1
> SAD_END_SSE2
>
> ;-----------------------------------------------------------------------------
>@@ -559,7 +601,7 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_32x8, 4,4,3
>+cglobal pixel_sad_32x8, 4,4,5
> pxor m0, m0
>
> PROCESS_SAD_32x4
>@@ -573,7 +615,7 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_32x24, 4,5,3
>+cglobal pixel_sad_32x24, 4,5,5
> pxor m0, m0
> mov r4d, 3
> .loop:
>@@ -590,7 +632,7 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_32x32, 4,5,3
>+cglobal pixel_sad_32x32, 4,5,5
> pxor m0, m0
> mov r4d, 4
> .loop:
>@@ -607,7 +649,7 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_32x16, 4,4,3
>+cglobal pixel_sad_32x16, 4,4,5
> pxor m0, m0
>
> PROCESS_SAD_32x4
>@@ -623,7 +665,7 @@
> ;-----------------------------------------------------------------------------
> ; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
>-cglobal pixel_sad_32x64, 4,5,3
>+cglobal pixel_sad_32x64, 4,5,5
> pxor m0, m0
> mov r4d, 8
> .loop:
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>