[x265] [PATCH] asm: fix crash when passing unaligned buffer to sad function
yuvaraj at multicorewareinc.com
Wed Jun 4 13:55:54 CEST 2014
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1401882930 -19800
# Wed Jun 04 17:25:30 2014 +0530
# Node ID dd62f0a2a2993d102daab51237dce2cf6f445243
# Parent 61a02a9c78eb0ec2a1cdaf05562a77b37355432d
asm: fix crash when passing unaligned buffer to sad function
In psy-rd, some input buffers are unaligned, which causes crashes in the
SAD primitives; fix those crashes.
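For context, here is a minimal C/SSE2 intrinsics sketch of the same idea (not part of the patch; the function and parameter names are illustrative only). psadbw with a 128-bit memory operand requires that operand to be 16-byte aligned, so feeding it an unaligned pointer faults. Loading both inputs with an unaligned move first (movu in the asm, _mm_loadu_si128 below) and doing a register-register psadbw works for any alignment, which is exactly what the patch switches to.

#include <emmintrin.h>
#include <stdint.h>

/* SAD of a 16-byte-wide block over 'rows' lines; neither buffer needs
   16-byte alignment because both loads are unaligned (movu-style). */
static int sad_16xN(const uint8_t *pix1, intptr_t stride1,
                    const uint8_t *pix2, intptr_t stride2, int rows)
{
    __m128i sum = _mm_setzero_si128();
    for (int y = 0; y < rows; y++)
    {
        __m128i a = _mm_loadu_si128((const __m128i *)(pix1 + y * stride1));
        __m128i b = _mm_loadu_si128((const __m128i *)(pix2 + y * stride2));
        /* psadbw: two 64-bit partial sums of absolute byte differences */
        sum = _mm_add_epi64(sum, _mm_sad_epu8(a, b));
    }
    /* combine the low and high 64-bit halves into the final SAD */
    return _mm_cvtsi128_si32(_mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
}

The asm change below follows the same pattern: each psadbw mN, [r0 + off] (aligned memory operand) becomes a movu into a scratch register followed by a register-register psadbw, which is also why the cglobal declarations now reserve 5 XMM registers instead of 3.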
diff -r 61a02a9c78eb -r dd62f0a2a299 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Wed Jun 04 15:37:37 2014 +0530
+++ b/source/common/x86/sad-a.asm Wed Jun 04 17:25:30 2014 +0530
@@ -213,32 +213,40 @@
%macro PROCESS_SAD_32x4 0
movu m1, [r2]
movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
paddd m0, m1
lea r2, [r2 + r3]
@@ -319,61 +327,77 @@
%macro PROCESS_SAD_64x4 0
movu m1, [r2]
movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
- paddd m3, m4
paddd m0, m1
- paddd m0, m3
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m3, [r0 + 32]
+ movu m4, [r0 + 48]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
- paddd m3, m4
paddd m0, m1
- paddd m0, m3
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m3, [r0 + 32]
+ movu m4, [r0 + 48]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
- paddd m3, m4
paddd m0, m1
- paddd m0, m3
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m3, [r0 + 32]
+ movu m4, [r0 + 48]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
movu m1, [r2]
movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
paddd m1, m2
- paddd m3, m4
paddd m0, m1
- paddd m0, m3
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m3, [r0 + 32]
+ movu m4, [r0 + 48]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
lea r2, [r2 + r3]
lea r0, [r0 + r1]
%endmacro
@@ -382,68 +406,86 @@
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x16, 4,4,8
- movu m0, [r2]
- movu m1, [r2+r3]
- lea r2, [r2+2*r3]
- movu m2, [r2]
- movu m3, [r2+r3]
- lea r2, [r2+2*r3]
- psadbw m0, [r0]
- psadbw m1, [r0+r1]
- lea r0, [r0+2*r1]
- movu m4, [r2]
- paddw m0, m1
- psadbw m2, [r0]
- psadbw m3, [r0+r1]
- lea r0, [r0+2*r1]
- movu m5, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m2, m3
- movu m6, [r2]
- movu m7, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m0, m2
- psadbw m4, [r0]
- psadbw m5, [r0+r1]
- lea r0, [r0+2*r1]
- movu m1, [r2]
- paddw m4, m5
- psadbw m6, [r0]
- psadbw m7, [r0+r1]
- lea r0, [r0+2*r1]
- movu m2, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m6, m7
- movu m3, [r2]
- paddw m0, m4
- movu m4, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m0, m6
- psadbw m1, [r0]
- psadbw m2, [r0+r1]
- lea r0, [r0+2*r1]
- movu m5, [r2]
- paddw m1, m2
- psadbw m3, [r0]
- psadbw m4, [r0+r1]
- lea r0, [r0+2*r1]
- movu m6, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m3, m4
- movu m7, [r2]
- paddw m0, m1
- movu m1, [r2+r3]
- paddw m0, m3
- psadbw m5, [r0]
- psadbw m6, [r0+r1]
- lea r0, [r0+2*r1]
- paddw m5, m6
- psadbw m7, [r0]
- psadbw m1, [r0+r1]
- paddw m7, m1
- paddw m0, m5
- paddw m0, m7
+cglobal pixel_sad_16x16, 4,4,5
+ pxor m0, m0
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
SAD_END_SSE2
;-----------------------------------------------------------------------------
@@ -559,7 +601,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x8, 4,4,3
+cglobal pixel_sad_32x8, 4,4,5
pxor m0, m0
PROCESS_SAD_32x4
@@ -573,7 +615,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x24, 4,5,3
+cglobal pixel_sad_32x24, 4,5,5
pxor m0, m0
mov r4d, 3
.loop:
@@ -590,7 +632,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x32, 4,5,3
+cglobal pixel_sad_32x32, 4,5,5
pxor m0, m0
mov r4d, 4
.loop:
@@ -607,7 +649,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x16, 4,4,3
+cglobal pixel_sad_32x16, 4,4,5
pxor m0, m0
PROCESS_SAD_32x4
@@ -623,7 +665,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x64, 4,5,3
+cglobal pixel_sad_32x64, 4,5,5
pxor m0, m0
mov r4d, 8
.loop: