[x265] [PATCH] asm: reduce large code size in sad_16xN, sad_32xN for better cache performance
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Thu Oct 31 11:04:56 CET 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383212434 -19800
# Thu Oct 31 15:10:34 2013 +0530
# Node ID 5d6ed411995acd674b838f989385c61039760780
# Parent a3235963489588dea19415cf7d4c6e6f979dba41
asm: reduce large code size in sad_16xN, sad_32xN for better cache performance
diff -r a32359634895 -r 5d6ed411995a source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 14:06:12 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 15:10:34 2013 +0530
@@ -490,21 +490,7 @@
PROCESS_SAD_16x4
PROCESS_SAD_16x4
-
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
+ PROCESS_SAD_16x4
movhlps m1, m0
paddd m0, m1
@@ -514,31 +500,14 @@
;-----------------------------------------------------------------------------
; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x32, 4,4,3
- pxor m0, m0
-
+cglobal pixel_sad_16x32, 4,5,3
+ pxor m0, m0
+ mov r4d, 4
+.loop
PROCESS_SAD_16x4
PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
-
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -548,39 +517,14 @@
;-----------------------------------------------------------------------------
; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x64, 4,4,3
- pxor m0, m0
-
+cglobal pixel_sad_16x64, 4,5,3
+ pxor m0, m0
+ mov r4d, 8
+.loop
PROCESS_SAD_16x4
PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
- PROCESS_SAD_16x4
-
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
- movu m1, [r2]
- movu m2, [r2 + r3]
- psadbw m1, [r0]
- psadbw m2, [r0 + r1]
- paddd m1, m2
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -618,37 +562,7 @@
pxor m0, m0
PROCESS_SAD_32x4
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
+ PROCESS_SAD_32x4
movhlps m1, m0
paddd m0, m1
@@ -658,45 +572,14 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x24, 4,4,3
+cglobal pixel_sad_32x24, 4,5,3
pxor m0, m0
-
+ mov r4d, 3
+.loop
PROCESS_SAD_32x4
PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -706,47 +589,14 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x32, 4,4,3
+cglobal pixel_sad_32x32, 4,5,3
pxor m0, m0
-
+ mov r4d, 4
+.loop
PROCESS_SAD_32x4
PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
@@ -762,37 +612,7 @@
PROCESS_SAD_32x4
PROCESS_SAD_32x4
PROCESS_SAD_32x4
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
+ PROCESS_SAD_32x4
movhlps m1, m0
paddd m0, m1
@@ -802,55 +622,14 @@
;-----------------------------------------------------------------------------
; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x64, 4,4,3
+cglobal pixel_sad_32x64, 4,5,3
pxor m0, m0
-
+ mov r4d, 8
+.loop
PROCESS_SAD_32x4
PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
- PROCESS_SAD_32x4
-
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
- lea r2, [r2 + r3]
- lea r0, [r0 + r1]
- movu m1, [r2]
- movu m2, [r2 + 16]
- psadbw m1, [r0]
- psadbw m2, [r0 + 16]
- paddd m1, m2
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
More information about the x265-devel
mailing list