[x265] [PATCH] asm: Optimized sad_64xN for better cache performance. Reduced lea instructions by half. Average performance gain of +5x w.r.t. the previous asm code
dnyaneshwar at multicorewareinc.com
Thu Oct 31 11:53:50 CET 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383216695 -19800
# Thu Oct 31 16:21:35 2013 +0530
# Node ID 86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
# Parent 5d6ed411995acd674b838f989385c61039760780
asm: Optimized sad_64xN for better cache performance. Reduced lea instructions by half. Average performance gain of +5x w.r.t. the previous asm code.
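Note for readers skimming the diff: the macro change folds the row stride directly into each load's effective address, so the separate lea pointer bump per row becomes one lea pair per two rows. A minimal sketch of one row pair, with register roles as in the patch (r0/r2 plane pointers, r1/r3 strides):

    ; before: advance the pointers with lea, then load at the base address
    lea    r2, [r2 + r3]
    lea    r0, [r0 + r1]
    movu   m1, [r2]
    psadbw m1, [r0]

    ; after: fold the stride into the address; one lea pair covers two rows
    movu   m1, [r2 + r3]
    psadbw m1, [r0 + r1]
    lea    r2, [r2 + 2 * r3]
    lea    r0, [r0 + 2 * r1]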
diff -r 5d6ed411995a -r 86ff1a3ec897 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 15:10:34 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 16:21:35 2013 +0530
@@ -329,38 +329,21 @@
paddd m3, m4
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 + 16]
+ movu m3, [r2 + r3 + 32]
+ movu m4, [r2 + r3 + 48]
+ psadbw m1, [r0 + r1]
+ psadbw m2, [r0 + r1 + 16]
+ psadbw m3, [r0 + r1 + 32]
+ psadbw m4, [r0 + r1 + 48]
paddd m1, m2
paddd m3, m4
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- movu m3, [r2 + 32]
- movu m4, [r2 + 48]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- psadbw m3, [r0 + 32]
- psadbw m4, [r0 + 48]
- paddd m1, m2
- paddd m3, m4
- paddd m0, m1
- paddd m0, m3
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
movu m1, [r2]
movu m2, [r2 + 16]
@@ -375,6 +358,20 @@
paddd m0, m1
paddd m0, m3
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 + 16]
+ movu m3, [r2 + r3 + 32]
+ movu m4, [r2 + r3 + 48]
+ psadbw m1, [r0 + r1]
+ psadbw m2, [r0 + r1 + 16]
+ psadbw m3, [r0 + r1 + 32]
+ psadbw m4, [r0 + r1 + 48]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
%endmacro
%macro SAD_W16 0
@@ -660,20 +657,8 @@
pxor m0, m0
PROCESS_SAD_64x4
-
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
-
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
-
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
movhlps m1, m0
@@ -684,27 +669,16 @@
;-----------------------------------------------------------------------------
; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_64x32, 4,4,5
+cglobal pixel_sad_64x32, 4,5,5
pxor m0, m0
- mov r4, 32
-
+ mov r4, 2
.loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- sub r4, 8
- cmp r4, 8
-
-jnz .loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
PROCESS_SAD_64x4
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -714,27 +688,15 @@
;-----------------------------------------------------------------------------
; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_64x48, 4,4,5
+cglobal pixel_sad_64x48, 4,5,5
pxor m0, m0
- mov r4, 48
-
+ mov r4, 4
.loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- sub r4, 8
- cmp r4, 8
-
-jnz .loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- PROCESS_SAD_64x4
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -744,27 +706,16 @@
;-----------------------------------------------------------------------------
; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_64x64, 4,4,5
+cglobal pixel_sad_64x64, 4,5,5
pxor m0, m0
- mov r4, 64
-
+ mov r4, 4
.loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
-
- sub r4, 8
- cmp r4, 8
-
-jnz .loop
PROCESS_SAD_64x4
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
PROCESS_SAD_64x4
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
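With the pointer advance now handled inside PROCESS_SAD_64x4, the 64x32/64x48/64x64 loops also shrink: the per-block lea pairs disappear and the byte-counting sub/cmp/jnz sequence becomes a plain iteration counter with dec/jnz. A sketch of the resulting 64x64 loop shape, taken from the hunks above (macro body elided):

        mov  r4, 4          ; 4 iterations x 16 rows = 64 rows
    .loop
        PROCESS_SAD_64x4    ; 4 rows per call; the macro advances r0/r2 itself
        PROCESS_SAD_64x4
        PROCESS_SAD_64x4
        PROCESS_SAD_64x4
        dec  r4d
        jnz  .loop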