[x265] [PATCH] asm: fix crash when passing unaligned buffer to sad function
Steve Borho
steve at borho.org
Wed Jun 4 18:19:17 CEST 2014
The callers of sad are responsible for making sure the first pointer
(fenc) is properly aligned.
On Wed, Jun 4, 2014 at 10:02 AM, chen <chenm003 at 163.com> wrote:
> it is slower since SAD is bottleneck in ME,
> Could you write an unaligned version for RDO-RC?
>
>
> At 2014-06-04 19:55:54,yuvaraj at multicorewareinc.com wrote:
>># HG changeset patch
>># User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
>># Date 1401882930 -19800
>># Wed Jun 04 17:25:30 2014 +0530
>># Node ID dd62f0a2a2993d102daab51237dce2cf6f445243
>># Parent 61a02a9c78eb0ec2a1cdaf05562a77b37355432d
>>asm: fix crash when passing unaligned buffer to sad function
>>
>>In psy-rd some input buffers are unaligned, which causes crashes
>>in the SAD primitives; fix those crashes
>>
>>diff -r 61a02a9c78eb -r dd62f0a2a299 source/common/x86/sad-a.asm
>>--- a/source/common/x86/sad-a.asm Wed Jun 04 15:37:37 2014 +0530
>>+++ b/source/common/x86/sad-a.asm Wed Jun 04 17:25:30 2014 +0530
>>@@ -213,32 +213,40 @@
>> %macro PROCESS_SAD_32x4 0
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>> paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>> paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>> paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>> paddd m0, m1
>> lea r2, [r2 + r3]
>>@@ -319,61 +327,77 @@
>> %macro PROCESS_SAD_64x4 0
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- movu m3, [r2 + 32]
>>- movu m4, [r2 + 48]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>- psadbw m3, [r0 + 32]
>>- psadbw m4, [r0 + 48]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>>- paddd m3, m4
>> paddd m0, m1
>>- paddd m0, m3
>>+ movu m1, [r2 + 32]
>>+ movu m2, [r2 + 48]
>>+ movu m3, [r0 + 32]
>>+ movu m4, [r0 + 48]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>>
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- movu m3, [r2 + 32]
>>- movu m4, [r2 + 48]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>- psadbw m3, [r0 + 32]
>>- psadbw m4, [r0 + 48]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>>- paddd m3, m4
>> paddd m0, m1
>>- paddd m0, m3
>>+ movu m1, [r2 + 32]
>>+ movu m2, [r2 + 48]
>>+ movu m3, [r0 + 32]
>>+ movu m4, [r0 + 48]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>>
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- movu m3, [r2 + 32]
>>- movu m4, [r2 + 48]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>- psadbw m3, [r0 + 32]
>>- psadbw m4, [r0 + 48]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>>- paddd m3, m4
>> paddd m0, m1
>>- paddd m0, m3
>>+ movu m1, [r2 + 32]
>>+ movu m2, [r2 + 48]
>>+ movu m3, [r0 + 32]
>>+ movu m4, [r0 + 48]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>>
>> movu m1, [r2]
>> movu m2, [r2 + 16]
>>- movu m3, [r2 + 32]
>>- movu m4, [r2 + 48]
>>- psadbw m1, [r0]
>>- psadbw m2, [r0 + 16]
>>- psadbw m3, [r0 + 32]
>>- psadbw m4, [r0 + 48]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + 16]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>> paddd m1, m2
>>- paddd m3, m4
>> paddd m0, m1
>>- paddd m0, m3
>>+ movu m1, [r2 + 32]
>>+ movu m2, [r2 + 48]
>>+ movu m3, [r0 + 32]
>>+ movu m4, [r0 + 48]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>> lea r2, [r2 + r3]
>> lea r0, [r0 + r1]
>> %endmacro
>>@@ -382,68 +406,86 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_16x16, 4,4,8
>>- movu m0, [r2]
>>- movu m1, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- movu m2, [r2]
>>- movu m3, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- psadbw m0, [r0]
>>- psadbw m1, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m4, [r2]
>>- paddw m0, m1
>>- psadbw m2, [r0]
>>- psadbw m3, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m5, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- paddw m2, m3
>>- movu m6, [r2]
>>- movu m7, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- paddw m0, m2
>>- psadbw m4, [r0]
>>- psadbw m5, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m1, [r2]
>>- paddw m4, m5
>>- psadbw m6, [r0]
>>- psadbw m7, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m2, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- paddw m6, m7
>>- movu m3, [r2]
>>- paddw m0, m4
>>- movu m4, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- paddw m0, m6
>>- psadbw m1, [r0]
>>- psadbw m2, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m5, [r2]
>>- paddw m1, m2
>>- psadbw m3, [r0]
>>- psadbw m4, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- movu m6, [r2+r3]
>>- lea r2, [r2+2*r3]
>>- paddw m3, m4
>>- movu m7, [r2]
>>- paddw m0, m1
>>- movu m1, [r2+r3]
>>- paddw m0, m3
>>- psadbw m5, [r0]
>>- psadbw m6, [r0+r1]
>>- lea r0, [r0+2*r1]
>>- paddw m5, m6
>>- psadbw m7, [r0]
>>- psadbw m1, [r0+r1]
>>- paddw m7, m1
>>- paddw m0, m5
>>- paddw m0, m7
>>+cglobal pixel_sad_16x16, 4,4,5
>>+ pxor m0, m0
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>>+ lea r2, [r2 + r3 * 2]
>>+ lea r0, [r0 + r1 * 2]
>>+ movu m1, [r2]
>>+ movu m2, [r2 + r3]
>>+ movu m3, [r0]
>>+ movu m4, [r0 + r1]
>>+ psadbw m1, m3
>>+ psadbw m2, m4
>>+ paddd m1, m2
>>+ paddd m0, m1
>> SAD_END_SSE2
>>
>>
>> ;-----------------------------------------------------------------------------
>>@@ -559,7 +601,7 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_32x8, 4,4,3
>>+cglobal pixel_sad_32x8, 4,4,5
>> pxor m0, m0
>>
>> PROCESS_SAD_32x4
>>@@ -573,7 +615,7 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_32x24, 4,5,3
>>+cglobal pixel_sad_32x24, 4,5,5
>> pxor m0, m0
>> mov r4d, 3
>> .loop:
>>@@ -590,7 +632,7 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_32x32, 4,5,3
>>+cglobal pixel_sad_32x32, 4,5,5
>> pxor m0, m0
>> mov r4d, 4
>> .loop:
>>@@ -607,7 +649,7 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_32x16, 4,4,3
>>+cglobal pixel_sad_32x16, 4,4,5
>> pxor m0, m0
>>
>> PROCESS_SAD_32x4
>>@@ -623,7 +665,7 @@
>>
>> ;-----------------------------------------------------------------------------
>> ; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
>>
>> ;-----------------------------------------------------------------------------
>>-cglobal pixel_sad_32x64, 4,5,3
>>+cglobal pixel_sad_32x64, 4,5,5
>> pxor m0, m0
>> mov r4d, 8
>> .loop:
>>_______________________________________________
>>x265-devel mailing list
>>x265-devel at videolan.org
>>https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
More information about the x265-devel
mailing list