[x265] [PATCH] asm: reduce large code size in sad_16xN, sad_32xN for better cache performance

dnyaneshwar at multicorewareinc.com
Thu Oct 31 11:04:56 CET 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1383212434 -19800
#      Thu Oct 31 15:10:34 2013 +0530
# Node ID 5d6ed411995acd674b838f989385c61039760780
# Parent  a3235963489588dea19415cf7d4c6e6f979dba41
asm: reduce large code size in sad_16xN, sad_32xN for better cache performance
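
The open-coded 4-row SAD sequences become invocations of the existing PROCESS_SAD_16x4 / PROCESS_SAD_32x4 macros, and long runs of those macro expansions collapse into a counted dec/jnz loop, so the taller block sizes keep only two copies of the macro body in the instruction stream instead of eight or sixteen. The loop counter lives in r4d, which is why the affected cglobal declarations grow from 4 to 5 general-purpose registers. A condensed sketch of the pattern (H is only a placeholder for the block height here, not a symbol used in the patch):

    cglobal pixel_sad_16xH, 4,5,3   ; one extra GPR (r4) for the loop counter
        pxor m0,  m0                ; running SAD accumulator
        mov  r4d, H/8               ; each iteration covers 8 rows
    .loop
        PROCESS_SAD_16x4            ; existing macro: SAD of 4 rows accumulated into m0
        PROCESS_SAD_16x4
        dec  r4d
        jnz  .loop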

diff -r a32359634895 -r 5d6ed411995a source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Thu Oct 31 14:06:12 2013 +0530
+++ b/source/common/x86/sad-a.asm	Thu Oct 31 15:10:34 2013 +0530
@@ -490,21 +490,7 @@
 
     PROCESS_SAD_16x4
     PROCESS_SAD_16x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + 2 * r3]
-    lea     r0,  [r0 + 2 * r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    PROCESS_SAD_16x4
 
     movhlps m1, m0
     paddd   m0, m1
@@ -514,31 +500,14 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x32, 4,4,3
-    pxor m0, m0
-
+cglobal pixel_sad_16x32, 4,5,3
+    pxor m0,  m0
+    mov  r4d, 4
+.loop
     PROCESS_SAD_16x4
     PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + 2 * r3]
-    lea     r0,  [r0 + 2 * r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    dec  r4d
+    jnz .loop
 
     movhlps m1, m0
     paddd   m0, m1
@@ -548,39 +517,14 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x64, 4,4,3
-    pxor m0, m0
-
+cglobal pixel_sad_16x64, 4,5,3
+    pxor m0,  m0
+    mov  r4d, 8
+.loop
     PROCESS_SAD_16x4
     PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-    PROCESS_SAD_16x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + 2 * r3]
-    lea     r0,  [r0 + 2 * r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + r3]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + r1]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    dec  r4d
+    jnz .loop
 
     movhlps m1, m0
     paddd   m0, m1
@@ -618,37 +562,7 @@
     pxor  m0,  m0
 
     PROCESS_SAD_32x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    PROCESS_SAD_32x4
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -658,45 +572,14 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x24, 4,4,3
+cglobal pixel_sad_32x24, 4,5,3
     pxor  m0,  m0
-
+    mov   r4d, 3
+.loop
     PROCESS_SAD_32x4
     PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    dec r4d
+    jnz .loop
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -706,47 +589,14 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x32, 4,4,3
+cglobal pixel_sad_32x32, 4,5,3
     pxor  m0,  m0
-
+    mov   r4d, 4
+.loop
     PROCESS_SAD_32x4
     PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    dec r4d
+    jnz .loop
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -762,37 +612,7 @@
     PROCESS_SAD_32x4
     PROCESS_SAD_32x4
     PROCESS_SAD_32x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    PROCESS_SAD_32x4
 
     movhlps m1,  m0
     paddd   m0,  m1
@@ -802,55 +622,14 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-cglobal pixel_sad_32x64, 4,4,3
+cglobal pixel_sad_32x64, 4,5,3
     pxor  m0,  m0
-
+    mov   r4d, 8
+.loop
     PROCESS_SAD_32x4
     PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-    PROCESS_SAD_32x4
-
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
-    lea     r2,  [r2 + r3]
-    lea     r0,  [r0 + r1]
-    movu    m1,  [r2]
-    movu    m2,  [r2 + 16]
-    psadbw  m1,  [r0]
-    psadbw  m2,  [r0 + 16]
-    paddd   m1,  m2
-    paddd   m0,  m1
+    dec  r4d
+    jnz .loop
 
     movhlps m1,  m0
     paddd   m0,  m1
