[x265] [PATCH] asm: reduce large code size in pixel_sad_8x32 for better cache performance

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Oct 31 09:37:24 CET 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383208572 -19800
#      Thu Oct 31 14:06:12 2013 +0530
# Node ID a3235963489588dea19415cf7d4c6e6f979dba41
# Parent  ec6b4d35f11053b06d0e1ea46df798ff89a4c127
asm: reduce large code size in pixel_sad_8x32 for better cache performance

diff -r ec6b4d35f110 -r a32359634895 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Thu Oct 31 00:09:49 2013 -0500
+++ b/source/common/x86/sad-a.asm	Thu Oct 31 14:06:12 2013 +0530
@@ -862,33 +862,12 @@
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_8x32, 4,4,3
+cglobal pixel_sad_8x32, 4,5,3           ; 5 GPRs declared: r4 is used below as the loop counter
     pxor  m0,  m0
-
+    mov   r4d, 4                        ; 4 passes x 2 PROCESS_SAD_8x4 = 8 invocations, replacing the unrolled sequence
+.loop:
     PROCESS_SAD_8x4
     PROCESS_SAD_8x4
-    PROCESS_SAD_8x4
-    PROCESS_SAD_8x4
-    PROCESS_SAD_8x4
-    PROCESS_SAD_8x4
-    PROCESS_SAD_8x4
-
-    movq        m1, [r2]
-    movq        m2, [r2 + r3]
-    lea         r2, [r2 + 2 * r3]
-    movq        m3, [r0]
-    movq        m4, [r0 + r1]
-    lea         r0, [r0 + 2 * r1]
-    punpcklqdq  m1, m2
-    punpcklqdq  m3, m4
-    psadbw      m1, m3
-    paddd       m0, m1
-    movq        m1, [r2]
-    movq        m2, [r2 + r3]
-    movq        m3, [r0]
-    movq        m4, [r0 + r1]
-    punpcklqdq  m1, m2
-    punpcklqdq  m3, m4
-    psadbw      m1, m3
-    paddd       m0, m1
+    dec   r4d
+    jnz   .loop
 
     movhlps m1,  m0
     paddd   m0,  m1


More information about the x265-devel mailing list