[x265] [PATCH] asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Thu Oct 31 12:18:09 CET 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383218218 -19800
# Thu Oct 31 16:46:58 2013 +0530
# Node ID 515b0af5eb805407d40ead87fd29a8c32118d3a2
# Parent 86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines
diff -r 86ff1a3ec897 -r 515b0af5eb80 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 16:21:35 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 16:46:58 2013 +0530
@@ -175,39 +175,37 @@
%macro PROCESS_SAD_24x4 0
movu m1, [r2]
movq m2, [r2 + 16]
- lea r2, [r2 + r3]
- movu m3, [r2]
- movq m4, [r2 + 16]
+ movu m3, [r2 + r3]
+ movq m4, [r2 + r3 + 16]
psadbw m1, [r0]
psadbw m3, [r0 + r1]
paddd m0, m1
paddd m0, m3
movq m1, [r0 + 16]
- lea r0, [r0 + r1]
- movq m3, [r0 + 16]
+ movq m3, [r0 + r1 + 16]
punpcklqdq m2, m4
punpcklqdq m1, m3
psadbw m2, m1
paddd m0, m2
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
movu m1, [r2]
movq m2, [r2 + 16]
- lea r2, [r2 + r3]
- movu m3, [r2]
- movq m4, [r2 + 16]
+ movu m3, [r2 + r3]
+ movq m4, [r2 + r3 + 16]
psadbw m1, [r0]
psadbw m3, [r0 + r1]
paddd m0, m1
paddd m0, m3
movq m1, [r0 + 16]
- lea r0, [r0 + r1]
- movq m3, [r0 + 16]
+ movq m3, [r0 + r1 + 16]
punpcklqdq m2, m4
punpcklqdq m1, m3
psadbw m2, m1
paddd m0, m2
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
%endmacro
%macro PROCESS_SAD_32x4 0
@@ -255,8 +253,18 @@
paddd m1, m2
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
+
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 + 16]
+ movu m3, [r2 + r3 + 32]
+ psadbw m1, [r0 + r1]
+ psadbw m2, [r0 + r1 + 16]
+ psadbw m3, [r0 + r1 + 32]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
movu m1, [r2]
movu m2, [r2 + 16]
@@ -267,30 +275,18 @@
paddd m1, m2
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 + 16]
+ movu m3, [r2 + r3 + 32]
+ psadbw m1, [r0 + r1]
+ psadbw m2, [r0 + r1 + 16]
+ psadbw m3, [r0 + r1 + 32]
paddd m1, m2
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- paddd m1, m2
- paddd m0, m1
- paddd m0, m3
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
%endmacro
%macro PROCESS_SAD_8x4 0
@@ -725,27 +721,17 @@
;-----------------------------------------------------------------------------
; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_48x64, 4,4,5
+cglobal pixel_sad_48x64, 4,5,5
pxor m0, m0
- mov r4, 64
+ mov r4d, 4
.loop
PROCESS_SAD_48x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_48x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- sub r4, 8
- cmp r4, 8
-
-jnz .loop
PROCESS_SAD_48x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
PROCESS_SAD_48x4
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -755,24 +741,17 @@
;-----------------------------------------------------------------------------
; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_24x32, 4,4,4
+cglobal pixel_sad_24x32, 4,5,4
pxor m0, m0
- mov r4, 32
+ mov r4d, 2
.loop
PROCESS_SAD_24x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
PROCESS_SAD_24x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- sub r4, 8
- cmp r4, 8
+ PROCESS_SAD_24x4
+ PROCESS_SAD_24x4
+ dec r4d
jnz .loop
- PROCESS_SAD_24x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- PROCESS_SAD_24x4
movhlps m1, m0
paddd m0, m1
More information about the x265-devel mailing list