[x265] [PATCH] asm: reduce large code size in pixel_sad_8x32 for better cache performance
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Oct 31 09:37:24 CET 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383208572 -19800
# Thu Oct 31 14:06:12 2013 +0530
# Node ID a3235963489588dea19415cf7d4c6e6f979dba41
# Parent ec6b4d35f11053b06d0e1ea46df798ff89a4c127
asm: reduce large code size in pixel_sad_8x32 for better cache performance
diff -r ec6b4d35f110 -r a32359634895 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 00:09:49 2013 -0500
+++ b/source/common/x86/sad-a.asm Thu Oct 31 14:06:12 2013 +0530
@@ -862,33 +862,12 @@
;-----------------------------------------------------------------------------
cglobal pixel_sad_8x32, 4,4,3
pxor m0, m0
-
+ mov r4d, 4
+.loop
PROCESS_SAD_8x4
PROCESS_SAD_8x4
- PROCESS_SAD_8x4
- PROCESS_SAD_8x4
- PROCESS_SAD_8x4
- PROCESS_SAD_8x4
- PROCESS_SAD_8x4
-
- movq m1, [r2]
- movq m2, [r2 + r3]
- lea r2, [r2 + 2 * r3]
- movq m3, [r0]
- movq m4, [r0 + r1]
- lea r0, [r0 + 2 * r1]
- punpcklqdq m1, m2
- punpcklqdq m3, m4
- psadbw m1, m3
- paddd m0, m1
- movq m1, [r2]
- movq m2, [r2 + r3]
- movq m3, [r0]
- movq m4, [r0 + r1]
- punpcklqdq m1, m2
- punpcklqdq m3, m4
- psadbw m1, m3
- paddd m0, m1
+ dec r4d
+ jnz .loop
movhlps m1, m0
paddd m0, m1
More information about the x265-devel mailing list