[x265] [PATCH] asm: pixel_sad_64xN reduce large code size

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Dec 3 08:56:16 CET 2013


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1386056358 -19800
#      Tue Dec 03 13:09:18 2013 +0530
# Node ID 4e00525b99e44f8054020e6562588c5509b6a784
# Parent  ca7bd538e052d104b1b333691836db37739cfdf0
asm: pixel_sad_64xN reduce large code size

diff -r ca7bd538e052 -r 4e00525b99e4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Dec 02 20:26:19 2013 -0600
+++ b/source/common/x86/sad-a.asm	Tue Dec 03 13:09:18 2013 +0530
@@ -374,7 +374,8 @@
     paddd   m3,  m4
     paddd   m0,  m1
     paddd   m0,  m3
-
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
 %endmacro
 
 %macro SAD_W16 0
@@ -660,20 +661,8 @@
     pxor  m0,  m0
 
     PROCESS_SAD_64x4
-
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
     PROCESS_SAD_64x4
-
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
     PROCESS_SAD_64x4
-
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
     PROCESS_SAD_64x4
 
     movhlps m1,  m0
@@ -686,25 +675,14 @@
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_64x32, 4,5,5
     pxor  m0,  m0
-    mov   r4,  32
+    mov   r4,  4
 
 .loop
     PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
+    PROCESS_SAD_64x4
 
-    PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
-    sub   r4,  8
-    cmp   r4,  8
-
-jnz .loop
-    PROCESS_SAD_64x4
-    lea   r2,  [r2 + r3]
-    lea   r0,  [r0 + r1]
-    PROCESS_SAD_64x4
+    dec   r4
+    jnz   .loop
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -716,25 +694,13 @@
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_64x48, 4,5,5
     pxor  m0,  m0
-    mov   r4,  48
+    mov   r4,  6
 
 .loop
     PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
     PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
-    sub   r4,  8
-    cmp   r4,  8
-
-jnz .loop
-    PROCESS_SAD_64x4
-    lea   r2,  [r2 + r3]
-    lea   r0,  [r0 + r1]
-    PROCESS_SAD_64x4
+    dec     r4d
+    jnz     .loop
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -746,25 +712,13 @@
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_64x64, 4,5,5
     pxor  m0,  m0
-    mov   r4,  64
+    mov   r4,  8
 
 .loop
     PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
     PROCESS_SAD_64x4
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-
-    sub   r4,  8
-    cmp   r4,  8
-
-jnz .loop
-    PROCESS_SAD_64x4
-    lea   r2,  [r2 + r3]
-    lea   r0,  [r0 + r1]
-    PROCESS_SAD_64x4
+    dec   r4
+    jnz   .loop
 
     movhlps m1,  m0
     paddd   m0,  m1


More information about the x265-devel mailing list